]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
shell-completion: fix header
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
cec736d2 22#include <errno.h>
cec736d2 23#include <fcntl.h>
11689d2a 24#include <linux/fs.h>
07630cea
LP
25#include <stddef.h>
26#include <sys/mman.h>
27#include <sys/statvfs.h>
28#include <sys/uio.h>
29#include <unistd.h>
fb0951b0 30
b5efdb8a 31#include "alloc-util.h"
f27a3864 32#include "btrfs-util.h"
c8b3094d 33#include "chattr-util.h"
07630cea 34#include "compress.h"
3ffd4af2 35#include "fd-util.h"
0284adc6 36#include "journal-authenticate.h"
cec736d2
LP
37#include "journal-def.h"
38#include "journal-file.h"
39#include "lookup3.h"
6bedfcbb 40#include "parse-util.h"
3df3e884 41#include "random-util.h"
7a24f3bf 42#include "sd-event.h"
07630cea 43#include "string-util.h"
89a5a90c 44#include "xattr-util.h"
cec736d2 45
4a92baf3
LP
/* Default sizes (in bytes) of the data and field hash tables */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Payloads smaller than this many bytes are never handed to the compressor */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL)                 /* 512 KiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL)                 /* 1 MiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the file system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* This is the default maximum number of journal files to keep around. */
#define DEFAULT_N_MAX_FILES (100)

/* n_data was the first entry we added after the initial file format design */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8 MiB */

/* Reread fstat() of the file for detecting deletions at least this often */
#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)

/* The mmap context to use for the header; we pick the one right above the last defined object type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX
90
9588bc32 91static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
92 assert(f);
93
94 if (!f->writable)
95 return -EPERM;
96
97 if (!(f->fd >= 0 && f->header))
98 return -EINVAL;
99
fa6ac760
LP
100 if (mmap_cache_got_sigbus(f->mmap, f->fd))
101 return -EIO;
102
26687bf8
OS
103 switch(f->header->state) {
104 case STATE_ONLINE:
105 return 0;
106
107 case STATE_OFFLINE:
108 f->header->state = STATE_ONLINE;
109 fsync(f->fd);
110 return 0;
111
112 default:
113 return -EINVAL;
114 }
115}
116
117int journal_file_set_offline(JournalFile *f) {
118 assert(f);
119
120 if (!f->writable)
121 return -EPERM;
122
123 if (!(f->fd >= 0 && f->header))
124 return -EINVAL;
125
126 if (f->header->state != STATE_ONLINE)
127 return 0;
128
129 fsync(f->fd);
130
fa6ac760
LP
131 if (mmap_cache_got_sigbus(f->mmap, f->fd))
132 return -EIO;
133
26687bf8
OS
134 f->header->state = STATE_OFFLINE;
135
fa6ac760
LP
136 if (mmap_cache_got_sigbus(f->mmap, f->fd))
137 return -EIO;
138
26687bf8
OS
139 fsync(f->fd);
140
141 return 0;
142}
143
804ae586 144JournalFile* journal_file_close(JournalFile *f) {
de190aef 145 assert(f);
cec736d2 146
feb12d3e 147#ifdef HAVE_GCRYPT
b0af6f41 148 /* Write the final tag */
c586dbf1 149 if (f->seal && f->writable)
b0af6f41 150 journal_file_append_tag(f);
feb12d3e 151#endif
b0af6f41 152
7a24f3bf
VC
153 if (f->post_change_timer) {
154 int enabled;
155
156 if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0)
157 if (enabled == SD_EVENT_ONESHOT)
158 journal_file_post_change(f);
159
e167d7fd 160 (void) sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
7a24f3bf
VC
161 sd_event_source_unref(f->post_change_timer);
162 }
163
26687bf8 164 journal_file_set_offline(f);
cec736d2 165
fa6ac760
LP
166 if (f->mmap && f->fd >= 0)
167 mmap_cache_close_fd(f->mmap, f->fd);
cec736d2 168
11689d2a
LP
169 if (f->fd >= 0 && f->defrag_on_close) {
170
171 /* Be friendly to btrfs: turn COW back on again now,
172 * and defragment the file. We won't write to the file
173 * ever again, hence remove all fragmentation, and
174 * reenable all the good bits COW usually provides
175 * (such as data checksumming). */
176
1ed8f8c1 177 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
11689d2a
LP
178 (void) btrfs_defrag_fd(f->fd);
179 }
f27a3864 180
03e334a1 181 safe_close(f->fd);
cec736d2 182 free(f->path);
807e17f0 183
f649045c 184 mmap_cache_unref(f->mmap);
16e9f408 185
4743015d 186 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 187
d89c8fdf 188#if defined(HAVE_XZ) || defined(HAVE_LZ4)
807e17f0
LP
189 free(f->compress_buffer);
190#endif
191
7560fffc 192#ifdef HAVE_GCRYPT
baed47c3
LP
193 if (f->fss_file)
194 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
dc4ebc07 195 else
b7c9ae91
LP
196 free(f->fsprg_state);
197
198 free(f->fsprg_seed);
7560fffc
LP
199
200 if (f->hmac)
201 gcry_md_close(f->hmac);
202#endif
203
cec736d2 204 free(f);
804ae586 205 return NULL;
cec736d2
LP
206}
207
0ac38b70 208static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 209 Header h = {};
cec736d2
LP
210 ssize_t k;
211 int r;
212
213 assert(f);
214
7560fffc 215 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 216 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 217
d89c8fdf
ZJS
218 h.incompatible_flags |= htole32(
219 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
220 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 221
d89c8fdf
ZJS
222 h.compatible_flags = htole32(
223 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 224
cec736d2
LP
225 r = sd_id128_randomize(&h.file_id);
226 if (r < 0)
227 return r;
228
0ac38b70
LP
229 if (template) {
230 h.seqnum_id = template->header->seqnum_id;
beec0085 231 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
232 } else
233 h.seqnum_id = h.file_id;
cec736d2
LP
234
235 k = pwrite(f->fd, &h, sizeof(h), 0);
236 if (k < 0)
237 return -errno;
238
239 if (k != sizeof(h))
240 return -EIO;
241
242 return 0;
243}
244
245static int journal_file_refresh_header(JournalFile *f) {
de190aef 246 sd_id128_t boot_id;
fa6ac760 247 int r;
cec736d2
LP
248
249 assert(f);
250
251 r = sd_id128_get_machine(&f->header->machine_id);
252 if (r < 0)
253 return r;
254
de190aef 255 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
256 if (r < 0)
257 return r;
258
de190aef
LP
259 if (sd_id128_equal(boot_id, f->header->boot_id))
260 f->tail_entry_monotonic_valid = true;
261
262 f->header->boot_id = boot_id;
263
fa6ac760 264 r = journal_file_set_online(f);
b788cc23 265
7560fffc 266 /* Sync the online state to disk */
a676e665 267 fsync(f->fd);
b788cc23 268
fa6ac760 269 return r;
cec736d2
LP
270}
271
272static int journal_file_verify_header(JournalFile *f) {
d89c8fdf
ZJS
273 uint32_t flags;
274
cec736d2
LP
275 assert(f);
276
7560fffc 277 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
278 return -EBADMSG;
279
7560fffc
LP
280 /* In both read and write mode we refuse to open files with
281 * incompatible flags we don't know */
d89c8fdf
ZJS
282 flags = le32toh(f->header->incompatible_flags);
283 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
284 if (flags & ~HEADER_INCOMPATIBLE_ANY)
285 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
286 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
287 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
288 if (flags)
289 log_debug("Journal file %s uses incompatible flags %"PRIx32
290 " disabled at compilation time.", f->path, flags);
cec736d2 291 return -EPROTONOSUPPORT;
d89c8fdf 292 }
cec736d2 293
7560fffc
LP
294 /* When open for writing we refuse to open files with
295 * compatible flags, too */
d89c8fdf
ZJS
296 flags = le32toh(f->header->compatible_flags);
297 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
298 if (flags & ~HEADER_COMPATIBLE_ANY)
299 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
300 f->path, flags & ~HEADER_COMPATIBLE_ANY);
301 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
302 if (flags)
303 log_debug("Journal file %s uses compatible flags %"PRIx32
304 " disabled at compilation time.", f->path, flags);
305 return -EPROTONOSUPPORT;
7560fffc
LP
306 }
307
db11ac1a
LP
308 if (f->header->state >= _STATE_MAX)
309 return -EBADMSG;
310
dca6219e
LP
311 /* The first addition was n_data, so check that we are at least this large */
312 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
313 return -EBADMSG;
314
8088cbd3 315 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
316 return -EBADMSG;
317
db11ac1a
LP
318 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
319 return -ENODATA;
320
321 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
322 return -ENODATA;
323
7762e02b
LP
324 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
325 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
326 !VALID64(le64toh(f->header->tail_object_offset)) ||
327 !VALID64(le64toh(f->header->entry_array_offset)))
328 return -ENODATA;
329
cec736d2 330 if (f->writable) {
ccdbaf91 331 uint8_t state;
cec736d2
LP
332 sd_id128_t machine_id;
333 int r;
334
335 r = sd_id128_get_machine(&machine_id);
336 if (r < 0)
337 return r;
338
339 if (!sd_id128_equal(machine_id, f->header->machine_id))
340 return -EHOSTDOWN;
341
de190aef 342 state = f->header->state;
cec736d2 343
71fa6f00
LP
344 if (state == STATE_ONLINE) {
345 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
346 return -EBUSY;
347 } else if (state == STATE_ARCHIVED)
cec736d2 348 return -ESHUTDOWN;
71fa6f00 349 else if (state != STATE_OFFLINE) {
8facc349 350 log_debug("Journal file %s has unknown state %i.", f->path, state);
71fa6f00
LP
351 return -EBUSY;
352 }
cec736d2
LP
353 }
354
d89c8fdf
ZJS
355 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
356 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 357
f1889c91 358 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 359
cec736d2
LP
360 return 0;
361}
362
2678031a
LP
363static int journal_file_fstat(JournalFile *f) {
364 assert(f);
365 assert(f->fd >= 0);
366
367 if (fstat(f->fd, &f->last_stat) < 0)
368 return -errno;
369
370 f->last_stat_usec = now(CLOCK_MONOTONIC);
371
372 /* Refuse appending to files that are already deleted */
373 if (f->last_stat.st_nlink <= 0)
374 return -EIDRM;
375
376 return 0;
377}
378
cec736d2 379static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 380 uint64_t old_size, new_size;
fec2aa2f 381 int r;
cec736d2
LP
382
383 assert(f);
384
cec736d2 385 /* We assume that this file is not sparse, and we know that
38ac38b2 386 * for sure, since we always call posix_fallocate()
cec736d2
LP
387 * ourselves */
388
fa6ac760
LP
389 if (mmap_cache_got_sigbus(f->mmap, f->fd))
390 return -EIO;
391
cec736d2 392 old_size =
23b0b2b2 393 le64toh(f->header->header_size) +
cec736d2
LP
394 le64toh(f->header->arena_size);
395
bc85bfee 396 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
397 if (new_size < le64toh(f->header->header_size))
398 new_size = le64toh(f->header->header_size);
bc85bfee 399
2678031a
LP
400 if (new_size <= old_size) {
401
402 /* We already pre-allocated enough space, but before
403 * we write to it, let's check with fstat() if the
404 * file got deleted, in order make sure we don't throw
405 * away the data immediately. Don't check fstat() for
406 * all writes though, but only once ever 10s. */
407
408 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
409 return 0;
410
411 return journal_file_fstat(f);
412 }
413
414 /* Allocate more space. */
cec736d2 415
a676e665 416 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 417 return -E2BIG;
cec736d2 418
a676e665 419 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
420 struct statvfs svfs;
421
422 if (fstatvfs(f->fd, &svfs) >= 0) {
423 uint64_t available;
424
070052ab 425 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
cec736d2
LP
426
427 if (new_size - old_size > available)
428 return -E2BIG;
429 }
430 }
431
eda4b58b
LP
432 /* Increase by larger blocks at once */
433 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
434 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
435 new_size = f->metrics.max_size;
436
bc85bfee
LP
437 /* Note that the glibc fallocate() fallback is very
438 inefficient, hence we try to minimize the allocation area
439 as we can. */
fec2aa2f
GV
440 r = posix_fallocate(f->fd, old_size, new_size - old_size);
441 if (r != 0)
442 return -r;
cec736d2 443
23b0b2b2 444 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 445
2678031a 446 return journal_file_fstat(f);
cec736d2
LP
447}
448
78519831 449static unsigned type_to_context(ObjectType type) {
d3d3208f 450 /* One context for each type, plus one catch-all for the rest */
69adae51 451 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 452 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 453 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
454}
455
7a9dabea 456static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
2678031a
LP
457 int r;
458
cec736d2 459 assert(f);
cec736d2
LP
460 assert(ret);
461
7762e02b
LP
462 if (size <= 0)
463 return -EINVAL;
464
2a59ea54 465 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
466 if (offset + size > (uint64_t) f->last_stat.st_size) {
467 /* Hmm, out of range? Let's refresh the fstat() data
468 * first, before we trust that check. */
469
2678031a
LP
470 r = journal_file_fstat(f);
471 if (r < 0)
472 return r;
473
474 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
475 return -EADDRNOTAVAIL;
476 }
477
7a9dabea 478 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
479}
480
16e9f408
LP
481static uint64_t minimum_header_size(Object *o) {
482
b8e891e6 483 static const uint64_t table[] = {
16e9f408
LP
484 [OBJECT_DATA] = sizeof(DataObject),
485 [OBJECT_FIELD] = sizeof(FieldObject),
486 [OBJECT_ENTRY] = sizeof(EntryObject),
487 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
488 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
489 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
490 [OBJECT_TAG] = sizeof(TagObject),
491 };
492
493 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
494 return sizeof(ObjectHeader);
495
496 return table[o->object.type];
497}
498
78519831 499int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
500 int r;
501 void *t;
502 Object *o;
503 uint64_t s;
504
505 assert(f);
506 assert(ret);
507
db11ac1a
LP
508 /* Objects may only be located at multiple of 64 bit */
509 if (!VALID64(offset))
510 return -EFAULT;
511
7a9dabea 512 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
513 if (r < 0)
514 return r;
515
516 o = (Object*) t;
517 s = le64toh(o->object.size);
518
519 if (s < sizeof(ObjectHeader))
520 return -EBADMSG;
521
16e9f408
LP
522 if (o->object.type <= OBJECT_UNUSED)
523 return -EBADMSG;
524
525 if (s < minimum_header_size(o))
526 return -EBADMSG;
527
d05089d8 528 if (type > OBJECT_UNUSED && o->object.type != type)
cec736d2
LP
529 return -EBADMSG;
530
531 if (s > sizeof(ObjectHeader)) {
7a9dabea 532 r = journal_file_move_to(f, type, false, offset, s, &t);
cec736d2
LP
533 if (r < 0)
534 return r;
535
536 o = (Object*) t;
537 }
538
cec736d2
LP
539 *ret = o;
540 return 0;
541}
542
d98cc1f2 543static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
544 uint64_t r;
545
546 assert(f);
547
beec0085 548 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
549
550 if (seqnum) {
de190aef 551 /* If an external seqnum counter was passed, we update
c2373f84
LP
552 * both the local and the external one, and set it to
553 * the maximum of both */
554
555 if (*seqnum + 1 > r)
556 r = *seqnum + 1;
557
558 *seqnum = r;
559 }
560
beec0085 561 f->header->tail_entry_seqnum = htole64(r);
cec736d2 562
beec0085
LP
563 if (f->header->head_entry_seqnum == 0)
564 f->header->head_entry_seqnum = htole64(r);
de190aef 565
cec736d2
LP
566 return r;
567}
568
78519831 569int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
570 int r;
571 uint64_t p;
572 Object *tail, *o;
573 void *t;
574
575 assert(f);
d05089d8 576 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
577 assert(size >= sizeof(ObjectHeader));
578 assert(offset);
579 assert(ret);
580
26687bf8
OS
581 r = journal_file_set_online(f);
582 if (r < 0)
583 return r;
584
cec736d2 585 p = le64toh(f->header->tail_object_offset);
cec736d2 586 if (p == 0)
23b0b2b2 587 p = le64toh(f->header->header_size);
cec736d2 588 else {
d05089d8 589 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
590 if (r < 0)
591 return r;
592
593 p += ALIGN64(le64toh(tail->object.size));
594 }
595
596 r = journal_file_allocate(f, p, size);
597 if (r < 0)
598 return r;
599
fcde2389 600 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
601 if (r < 0)
602 return r;
603
604 o = (Object*) t;
605
606 zero(o->object);
de190aef 607 o->object.type = type;
cec736d2
LP
608 o->object.size = htole64(size);
609
610 f->header->tail_object_offset = htole64(p);
cec736d2
LP
611 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
612
613 *ret = o;
614 *offset = p;
615
616 return 0;
617}
618
de190aef 619static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
620 uint64_t s, p;
621 Object *o;
622 int r;
623
624 assert(f);
625
070052ab
LP
626 /* We estimate that we need 1 hash table entry per 768 bytes
627 of journal file and we want to make sure we never get
628 beyond 75% fill level. Calculate the hash table size for
629 the maximum file size based on these metrics. */
4a92baf3 630
dfabe643 631 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
632 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
633 s = DEFAULT_DATA_HASH_TABLE_SIZE;
634
507f22bd 635 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 636
de190aef
LP
637 r = journal_file_append_object(f,
638 OBJECT_DATA_HASH_TABLE,
639 offsetof(Object, hash_table.items) + s,
640 &o, &p);
cec736d2
LP
641 if (r < 0)
642 return r;
643
29804cc1 644 memzero(o->hash_table.items, s);
cec736d2 645
de190aef
LP
646 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
647 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
648
649 return 0;
650}
651
de190aef 652static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
653 uint64_t s, p;
654 Object *o;
655 int r;
656
657 assert(f);
658
3c1668da
LP
659 /* We use a fixed size hash table for the fields as this
660 * number should grow very slowly only */
661
de190aef
LP
662 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
663 r = journal_file_append_object(f,
664 OBJECT_FIELD_HASH_TABLE,
665 offsetof(Object, hash_table.items) + s,
666 &o, &p);
cec736d2
LP
667 if (r < 0)
668 return r;
669
29804cc1 670 memzero(o->hash_table.items, s);
cec736d2 671
de190aef
LP
672 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
673 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
674
675 return 0;
676}
677
dade37d4 678int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
679 uint64_t s, p;
680 void *t;
681 int r;
682
683 assert(f);
684
dade37d4
LP
685 if (f->data_hash_table)
686 return 0;
687
de190aef
LP
688 p = le64toh(f->header->data_hash_table_offset);
689 s = le64toh(f->header->data_hash_table_size);
cec736d2 690
de190aef 691 r = journal_file_move_to(f,
16e9f408 692 OBJECT_DATA_HASH_TABLE,
fcde2389 693 true,
de190aef
LP
694 p, s,
695 &t);
cec736d2
LP
696 if (r < 0)
697 return r;
698
de190aef 699 f->data_hash_table = t;
cec736d2
LP
700 return 0;
701}
702
dade37d4 703int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
704 uint64_t s, p;
705 void *t;
706 int r;
707
708 assert(f);
709
dade37d4
LP
710 if (f->field_hash_table)
711 return 0;
712
de190aef
LP
713 p = le64toh(f->header->field_hash_table_offset);
714 s = le64toh(f->header->field_hash_table_size);
cec736d2 715
de190aef 716 r = journal_file_move_to(f,
16e9f408 717 OBJECT_FIELD_HASH_TABLE,
fcde2389 718 true,
de190aef
LP
719 p, s,
720 &t);
cec736d2
LP
721 if (r < 0)
722 return r;
723
de190aef 724 f->field_hash_table = t;
cec736d2
LP
725 return 0;
726}
727
3c1668da
LP
728static int journal_file_link_field(
729 JournalFile *f,
730 Object *o,
731 uint64_t offset,
732 uint64_t hash) {
733
805d1486 734 uint64_t p, h, m;
3c1668da
LP
735 int r;
736
737 assert(f);
738 assert(o);
739 assert(offset > 0);
740
741 if (o->object.type != OBJECT_FIELD)
742 return -EINVAL;
743
805d1486
LP
744 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
745 if (m <= 0)
746 return -EBADMSG;
3c1668da 747
805d1486 748 /* This might alter the window we are looking at */
3c1668da
LP
749 o->field.next_hash_offset = o->field.head_data_offset = 0;
750
805d1486 751 h = hash % m;
3c1668da
LP
752 p = le64toh(f->field_hash_table[h].tail_hash_offset);
753 if (p == 0)
754 f->field_hash_table[h].head_hash_offset = htole64(offset);
755 else {
756 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
757 if (r < 0)
758 return r;
759
760 o->field.next_hash_offset = htole64(offset);
761 }
762
763 f->field_hash_table[h].tail_hash_offset = htole64(offset);
764
765 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
766 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
767
768 return 0;
769}
770
771static int journal_file_link_data(
772 JournalFile *f,
773 Object *o,
774 uint64_t offset,
775 uint64_t hash) {
776
805d1486 777 uint64_t p, h, m;
cec736d2
LP
778 int r;
779
780 assert(f);
781 assert(o);
782 assert(offset > 0);
b588975f
LP
783
784 if (o->object.type != OBJECT_DATA)
785 return -EINVAL;
cec736d2 786
805d1486
LP
787 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
788 if (m <= 0)
789 return -EBADMSG;
48496df6 790
805d1486 791 /* This might alter the window we are looking at */
de190aef
LP
792 o->data.next_hash_offset = o->data.next_field_offset = 0;
793 o->data.entry_offset = o->data.entry_array_offset = 0;
794 o->data.n_entries = 0;
cec736d2 795
805d1486 796 h = hash % m;
8db4213e 797 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 798 if (p == 0)
cec736d2 799 /* Only entry in the hash table is easy */
de190aef 800 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 801 else {
48496df6
LP
802 /* Move back to the previous data object, to patch in
803 * pointer */
cec736d2 804
de190aef 805 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
806 if (r < 0)
807 return r;
808
de190aef 809 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
810 }
811
de190aef 812 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 813
dca6219e
LP
814 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
815 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
816
cec736d2
LP
817 return 0;
818}
819
3c1668da
LP
820int journal_file_find_field_object_with_hash(
821 JournalFile *f,
822 const void *field, uint64_t size, uint64_t hash,
823 Object **ret, uint64_t *offset) {
824
805d1486 825 uint64_t p, osize, h, m;
3c1668da
LP
826 int r;
827
828 assert(f);
829 assert(field && size > 0);
830
dade37d4
LP
831 /* If the field hash table is empty, we can't find anything */
832 if (le64toh(f->header->field_hash_table_size) <= 0)
833 return 0;
834
835 /* Map the field hash table, if it isn't mapped yet. */
836 r = journal_file_map_field_hash_table(f);
837 if (r < 0)
838 return r;
839
3c1668da
LP
840 osize = offsetof(Object, field.payload) + size;
841
805d1486 842 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
805d1486 843 if (m <= 0)
3c1668da
LP
844 return -EBADMSG;
845
805d1486 846 h = hash % m;
3c1668da
LP
847 p = le64toh(f->field_hash_table[h].head_hash_offset);
848
849 while (p > 0) {
850 Object *o;
851
852 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
853 if (r < 0)
854 return r;
855
856 if (le64toh(o->field.hash) == hash &&
857 le64toh(o->object.size) == osize &&
858 memcmp(o->field.payload, field, size) == 0) {
859
860 if (ret)
861 *ret = o;
862 if (offset)
863 *offset = p;
864
865 return 1;
866 }
867
868 p = le64toh(o->field.next_hash_offset);
869 }
870
871 return 0;
872}
873
874int journal_file_find_field_object(
875 JournalFile *f,
876 const void *field, uint64_t size,
877 Object **ret, uint64_t *offset) {
878
879 uint64_t hash;
880
881 assert(f);
882 assert(field && size > 0);
883
884 hash = hash64(field, size);
885
886 return journal_file_find_field_object_with_hash(f,
887 field, size, hash,
888 ret, offset);
889}
890
de190aef
LP
891int journal_file_find_data_object_with_hash(
892 JournalFile *f,
893 const void *data, uint64_t size, uint64_t hash,
894 Object **ret, uint64_t *offset) {
48496df6 895
805d1486 896 uint64_t p, osize, h, m;
cec736d2
LP
897 int r;
898
899 assert(f);
900 assert(data || size == 0);
901
dade37d4
LP
902 /* If there's no data hash table, then there's no entry. */
903 if (le64toh(f->header->data_hash_table_size) <= 0)
904 return 0;
905
906 /* Map the data hash table, if it isn't mapped yet. */
907 r = journal_file_map_data_hash_table(f);
908 if (r < 0)
909 return r;
910
cec736d2
LP
911 osize = offsetof(Object, data.payload) + size;
912
805d1486
LP
913 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
914 if (m <= 0)
bc85bfee
LP
915 return -EBADMSG;
916
805d1486 917 h = hash % m;
de190aef 918 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 919
de190aef
LP
920 while (p > 0) {
921 Object *o;
cec736d2 922
de190aef 923 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
924 if (r < 0)
925 return r;
926
807e17f0 927 if (le64toh(o->data.hash) != hash)
85a131e8 928 goto next;
807e17f0 929
d89c8fdf 930 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 931#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51 932 uint64_t l;
a7f7d1bd 933 size_t rsize = 0;
cec736d2 934
807e17f0
LP
935 l = le64toh(o->object.size);
936 if (l <= offsetof(Object, data.payload))
cec736d2
LP
937 return -EBADMSG;
938
807e17f0
LP
939 l -= offsetof(Object, data.payload);
940
d89c8fdf
ZJS
941 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
942 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
943 if (r < 0)
944 return r;
807e17f0 945
b785c858 946 if (rsize == size &&
807e17f0
LP
947 memcmp(f->compress_buffer, data, size) == 0) {
948
949 if (ret)
950 *ret = o;
951
952 if (offset)
953 *offset = p;
954
955 return 1;
956 }
3b1a55e1
ZJS
957#else
958 return -EPROTONOSUPPORT;
959#endif
807e17f0
LP
960 } else if (le64toh(o->object.size) == osize &&
961 memcmp(o->data.payload, data, size) == 0) {
962
cec736d2
LP
963 if (ret)
964 *ret = o;
965
966 if (offset)
967 *offset = p;
968
de190aef 969 return 1;
cec736d2
LP
970 }
971
85a131e8 972 next:
cec736d2
LP
973 p = le64toh(o->data.next_hash_offset);
974 }
975
de190aef
LP
976 return 0;
977}
978
979int journal_file_find_data_object(
980 JournalFile *f,
981 const void *data, uint64_t size,
982 Object **ret, uint64_t *offset) {
983
984 uint64_t hash;
985
986 assert(f);
987 assert(data || size == 0);
988
989 hash = hash64(data, size);
990
991 return journal_file_find_data_object_with_hash(f,
992 data, size, hash,
993 ret, offset);
994}
995
3c1668da
LP
996static int journal_file_append_field(
997 JournalFile *f,
998 const void *field, uint64_t size,
999 Object **ret, uint64_t *offset) {
1000
1001 uint64_t hash, p;
1002 uint64_t osize;
1003 Object *o;
1004 int r;
1005
1006 assert(f);
1007 assert(field && size > 0);
1008
1009 hash = hash64(field, size);
1010
1011 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1012 if (r < 0)
1013 return r;
1014 else if (r > 0) {
1015
1016 if (ret)
1017 *ret = o;
1018
1019 if (offset)
1020 *offset = p;
1021
1022 return 0;
1023 }
1024
1025 osize = offsetof(Object, field.payload) + size;
1026 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
1027 if (r < 0)
1028 return r;
3c1668da
LP
1029
1030 o->field.hash = htole64(hash);
1031 memcpy(o->field.payload, field, size);
1032
1033 r = journal_file_link_field(f, o, p, hash);
1034 if (r < 0)
1035 return r;
1036
1037 /* The linking might have altered the window, so let's
1038 * refresh our pointer */
1039 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1040 if (r < 0)
1041 return r;
1042
1043#ifdef HAVE_GCRYPT
1044 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1045 if (r < 0)
1046 return r;
1047#endif
1048
1049 if (ret)
1050 *ret = o;
1051
1052 if (offset)
1053 *offset = p;
1054
1055 return 0;
1056}
1057
48496df6
LP
1058static int journal_file_append_data(
1059 JournalFile *f,
1060 const void *data, uint64_t size,
1061 Object **ret, uint64_t *offset) {
1062
de190aef
LP
1063 uint64_t hash, p;
1064 uint64_t osize;
1065 Object *o;
d89c8fdf 1066 int r, compression = 0;
3c1668da 1067 const void *eq;
de190aef
LP
1068
1069 assert(f);
1070 assert(data || size == 0);
1071
1072 hash = hash64(data, size);
1073
1074 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1075 if (r < 0)
1076 return r;
0240c603 1077 if (r > 0) {
de190aef
LP
1078
1079 if (ret)
1080 *ret = o;
1081
1082 if (offset)
1083 *offset = p;
1084
1085 return 0;
1086 }
1087
1088 osize = offsetof(Object, data.payload) + size;
1089 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1090 if (r < 0)
1091 return r;
1092
cec736d2 1093 o->data.hash = htole64(hash);
807e17f0 1094
d89c8fdf 1095#if defined(HAVE_XZ) || defined(HAVE_LZ4)
d1afbcd2 1096 if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
a7f7d1bd 1097 size_t rsize = 0;
807e17f0 1098
5d6f46b6 1099 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
807e17f0 1100
d1afbcd2 1101 if (compression >= 0) {
807e17f0 1102 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1103 o->object.flags |= compression;
807e17f0 1104
fa1c4b51 1105 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1106 size, rsize, object_compressed_to_string(compression));
d1afbcd2
LP
1107 } else
1108 /* Compression didn't work, we don't really care why, let's continue without compression */
1109 compression = 0;
807e17f0
LP
1110 }
1111#endif
1112
d1afbcd2 1113 if (compression == 0 && size > 0)
807e17f0 1114 memcpy(o->data.payload, data, size);
cec736d2 1115
de190aef 1116 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1117 if (r < 0)
1118 return r;
1119
48496df6
LP
1120 /* The linking might have altered the window, so let's
1121 * refresh our pointer */
1122 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1123 if (r < 0)
1124 return r;
1125
08c6f819
SL
1126 if (!data)
1127 eq = NULL;
1128 else
1129 eq = memchr(data, '=', size);
3c1668da 1130 if (eq && eq > data) {
748db592 1131 Object *fo = NULL;
3c1668da 1132 uint64_t fp;
3c1668da
LP
1133
1134 /* Create field object ... */
1135 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1136 if (r < 0)
1137 return r;
1138
1139 /* ... and link it in. */
1140 o->data.next_field_offset = fo->field.head_data_offset;
1141 fo->field.head_data_offset = le64toh(p);
1142 }
1143
5996c7c2
LP
1144#ifdef HAVE_GCRYPT
1145 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1146 if (r < 0)
1147 return r;
1148#endif
1149
cec736d2
LP
1150 if (ret)
1151 *ret = o;
1152
1153 if (offset)
de190aef 1154 *offset = p;
cec736d2
LP
1155
1156 return 0;
1157}
1158
1159uint64_t journal_file_entry_n_items(Object *o) {
1160 assert(o);
b588975f
LP
1161
1162 if (o->object.type != OBJECT_ENTRY)
1163 return 0;
cec736d2
LP
1164
1165 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1166}
1167
0284adc6 1168uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1169 assert(o);
b588975f
LP
1170
1171 if (o->object.type != OBJECT_ENTRY_ARRAY)
1172 return 0;
de190aef
LP
1173
1174 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1175}
1176
fb9a24b6
LP
1177uint64_t journal_file_hash_table_n_items(Object *o) {
1178 assert(o);
b588975f
LP
1179
1180 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1181 o->object.type != OBJECT_FIELD_HASH_TABLE)
1182 return 0;
fb9a24b6
LP
1183
1184 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1185}
1186
de190aef 1187static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1188 le64_t *first,
1189 le64_t *idx,
de190aef 1190 uint64_t p) {
cec736d2 1191 int r;
de190aef
LP
1192 uint64_t n = 0, ap = 0, q, i, a, hidx;
1193 Object *o;
1194
cec736d2 1195 assert(f);
de190aef
LP
1196 assert(first);
1197 assert(idx);
1198 assert(p > 0);
cec736d2 1199
de190aef
LP
1200 a = le64toh(*first);
1201 i = hidx = le64toh(*idx);
1202 while (a > 0) {
1203
1204 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1205 if (r < 0)
1206 return r;
cec736d2 1207
de190aef
LP
1208 n = journal_file_entry_array_n_items(o);
1209 if (i < n) {
1210 o->entry_array.items[i] = htole64(p);
1211 *idx = htole64(hidx + 1);
1212 return 0;
1213 }
cec736d2 1214
de190aef
LP
1215 i -= n;
1216 ap = a;
1217 a = le64toh(o->entry_array.next_entry_array_offset);
1218 }
1219
1220 if (hidx > n)
1221 n = (hidx+1) * 2;
1222 else
1223 n = n * 2;
1224
1225 if (n < 4)
1226 n = 4;
1227
1228 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1229 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1230 &o, &q);
cec736d2
LP
1231 if (r < 0)
1232 return r;
1233
feb12d3e 1234#ifdef HAVE_GCRYPT
5996c7c2 1235 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1236 if (r < 0)
1237 return r;
feb12d3e 1238#endif
b0af6f41 1239
de190aef 1240 o->entry_array.items[i] = htole64(p);
cec736d2 1241
de190aef 1242 if (ap == 0)
7be3aa17 1243 *first = htole64(q);
cec736d2 1244 else {
de190aef 1245 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1246 if (r < 0)
1247 return r;
1248
de190aef
LP
1249 o->entry_array.next_entry_array_offset = htole64(q);
1250 }
cec736d2 1251
2dee23eb
LP
1252 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1253 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1254
de190aef
LP
1255 *idx = htole64(hidx + 1);
1256
1257 return 0;
1258}
cec736d2 1259
de190aef 1260static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1261 le64_t *extra,
1262 le64_t *first,
1263 le64_t *idx,
de190aef
LP
1264 uint64_t p) {
1265
1266 int r;
1267
1268 assert(f);
1269 assert(extra);
1270 assert(first);
1271 assert(idx);
1272 assert(p > 0);
1273
1274 if (*idx == 0)
1275 *extra = htole64(p);
1276 else {
4fd052ae 1277 le64_t i;
de190aef 1278
7be3aa17 1279 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1280 r = link_entry_into_array(f, first, &i, p);
1281 if (r < 0)
1282 return r;
cec736d2
LP
1283 }
1284
de190aef
LP
1285 *idx = htole64(le64toh(*idx) + 1);
1286 return 0;
1287}
1288
1289static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1290 uint64_t p;
1291 int r;
1292 assert(f);
1293 assert(o);
1294 assert(offset > 0);
1295
1296 p = le64toh(o->entry.items[i].object_offset);
1297 if (p == 0)
1298 return -EINVAL;
1299
1300 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1301 if (r < 0)
1302 return r;
1303
de190aef
LP
1304 return link_entry_into_array_plus_one(f,
1305 &o->data.entry_offset,
1306 &o->data.entry_array_offset,
1307 &o->data.n_entries,
1308 offset);
cec736d2
LP
1309}
1310
1311static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1312 uint64_t n, i;
cec736d2
LP
1313 int r;
1314
1315 assert(f);
1316 assert(o);
1317 assert(offset > 0);
b588975f
LP
1318
1319 if (o->object.type != OBJECT_ENTRY)
1320 return -EINVAL;
cec736d2 1321
b788cc23
LP
1322 __sync_synchronize();
1323
cec736d2 1324 /* Link up the entry itself */
de190aef
LP
1325 r = link_entry_into_array(f,
1326 &f->header->entry_array_offset,
1327 &f->header->n_entries,
1328 offset);
1329 if (r < 0)
1330 return r;
cec736d2 1331
507f22bd 1332 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1333
de190aef 1334 if (f->header->head_entry_realtime == 0)
0ac38b70 1335 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1336
0ac38b70 1337 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1338 f->header->tail_entry_monotonic = o->entry.monotonic;
1339
1340 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1341
1342 /* Link up the items */
1343 n = journal_file_entry_n_items(o);
1344 for (i = 0; i < n; i++) {
1345 r = journal_file_link_entry_item(f, o, offset, i);
1346 if (r < 0)
1347 return r;
1348 }
1349
cec736d2
LP
1350 return 0;
1351}
1352
1353static int journal_file_append_entry_internal(
1354 JournalFile *f,
1355 const dual_timestamp *ts,
1356 uint64_t xor_hash,
1357 const EntryItem items[], unsigned n_items,
de190aef 1358 uint64_t *seqnum,
cec736d2
LP
1359 Object **ret, uint64_t *offset) {
1360 uint64_t np;
1361 uint64_t osize;
1362 Object *o;
1363 int r;
1364
1365 assert(f);
1366 assert(items || n_items == 0);
de190aef 1367 assert(ts);
cec736d2
LP
1368
1369 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1370
de190aef 1371 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1372 if (r < 0)
1373 return r;
1374
d98cc1f2 1375 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1376 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1377 o->entry.realtime = htole64(ts->realtime);
1378 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1379 o->entry.xor_hash = htole64(xor_hash);
1380 o->entry.boot_id = f->header->boot_id;
1381
feb12d3e 1382#ifdef HAVE_GCRYPT
5996c7c2 1383 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1384 if (r < 0)
1385 return r;
feb12d3e 1386#endif
b0af6f41 1387
cec736d2
LP
1388 r = journal_file_link_entry(f, o, np);
1389 if (r < 0)
1390 return r;
1391
1392 if (ret)
1393 *ret = o;
1394
1395 if (offset)
1396 *offset = np;
1397
1398 return 0;
1399}
1400
cf244689 1401void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1402 assert(f);
1403
1404 /* inotify() does not receive IN_MODIFY events from file
1405 * accesses done via mmap(). After each access we hence
1406 * trigger IN_MODIFY by truncating the journal file to its
1407 * current size which triggers IN_MODIFY. */
1408
bc85bfee
LP
1409 __sync_synchronize();
1410
50f20cfd 1411 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
e167d7fd 1412 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1413}
1414
7a24f3bf
VC
1415static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1416 assert(userdata);
1417
1418 journal_file_post_change(userdata);
1419
1420 return 1;
1421}
1422
1423static void schedule_post_change(JournalFile *f) {
1424 sd_event_source *timer;
1425 int enabled, r;
1426 uint64_t now;
1427
1428 assert(f);
1429 assert(f->post_change_timer);
1430
1431 timer = f->post_change_timer;
1432
1433 r = sd_event_source_get_enabled(timer, &enabled);
1434 if (r < 0) {
e167d7fd
LP
1435 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1436 goto fail;
7a24f3bf
VC
1437 }
1438
1439 if (enabled == SD_EVENT_ONESHOT)
1440 return;
1441
1442 r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now);
1443 if (r < 0) {
e167d7fd
LP
1444 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1445 goto fail;
7a24f3bf
VC
1446 }
1447
1448 r = sd_event_source_set_time(timer, now+f->post_change_timer_period);
1449 if (r < 0) {
e167d7fd
LP
1450 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1451 goto fail;
7a24f3bf
VC
1452 }
1453
1454 r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
1455 if (r < 0) {
e167d7fd
LP
1456 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1457 goto fail;
7a24f3bf 1458 }
e167d7fd
LP
1459
1460 return;
1461
1462fail:
1463 /* On failure, let's simply post the change immediately. */
1464 journal_file_post_change(f);
7a24f3bf
VC
1465}
1466
1467/* Enable coalesced change posting in a timer on the provided sd_event instance */
1468int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1469 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1470 int r;
1471
1472 assert(f);
1473 assert_return(!f->post_change_timer, -EINVAL);
1474 assert(e);
1475 assert(t);
1476
1477 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1478 if (r < 0)
1479 return r;
1480
1481 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1482 if (r < 0)
1483 return r;
1484
1485 f->post_change_timer = timer;
1486 timer = NULL;
1487 f->post_change_timer_period = t;
1488
1489 return r;
1490}
1491
1f2da9ec
LP
1492static int entry_item_cmp(const void *_a, const void *_b) {
1493 const EntryItem *a = _a, *b = _b;
1494
1495 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1496 return -1;
1497 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1498 return 1;
1499 return 0;
1500}
1501
de190aef 1502int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1503 unsigned i;
1504 EntryItem *items;
1505 int r;
1506 uint64_t xor_hash = 0;
de190aef 1507 struct dual_timestamp _ts;
cec736d2
LP
1508
1509 assert(f);
1510 assert(iovec || n_iovec == 0);
1511
de190aef
LP
1512 if (!ts) {
1513 dual_timestamp_get(&_ts);
1514 ts = &_ts;
1515 }
1516
1517 if (f->tail_entry_monotonic_valid &&
1518 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1519 return -EINVAL;
1520
feb12d3e 1521#ifdef HAVE_GCRYPT
7560fffc
LP
1522 r = journal_file_maybe_append_tag(f, ts->realtime);
1523 if (r < 0)
1524 return r;
feb12d3e 1525#endif
7560fffc 1526
64825d3c 1527 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1528 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1529
1530 for (i = 0; i < n_iovec; i++) {
1531 uint64_t p;
1532 Object *o;
1533
1534 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1535 if (r < 0)
cf244689 1536 return r;
cec736d2
LP
1537
1538 xor_hash ^= le64toh(o->data.hash);
1539 items[i].object_offset = htole64(p);
de7b95cd 1540 items[i].hash = o->data.hash;
cec736d2
LP
1541 }
1542
1f2da9ec
LP
1543 /* Order by the position on disk, in order to improve seek
1544 * times for rotating media. */
7ff7394d 1545 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1546
de190aef 1547 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1548
fa6ac760
LP
1549 /* If the memory mapping triggered a SIGBUS then we return an
1550 * IO error and ignore the error code passed down to us, since
1551 * it is very likely just an effect of a nullified replacement
1552 * mapping page */
1553
1554 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1555 r = -EIO;
1556
7a24f3bf
VC
1557 if (f->post_change_timer)
1558 schedule_post_change(f);
1559 else
1560 journal_file_post_change(f);
50f20cfd 1561
cec736d2
LP
1562 return r;
1563}
1564
/* One cached position inside a chain of entry arrays, keyed by the offset of the chain's first
 * array. Used to avoid re-walking a chain from its head on every lookup. */
typedef struct ChainCacheItem {
        uint64_t first; /* the array at the beginning of the chain */
        uint64_t array; /* the cached array */
        uint64_t begin; /* the first item in the cached array */
        uint64_t total; /* the total number of items in all arrays before this one in the chain */
        uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
1572
1573static void chain_cache_put(
4743015d 1574 OrderedHashmap *h,
a4bcff5b
LP
1575 ChainCacheItem *ci,
1576 uint64_t first,
1577 uint64_t array,
1578 uint64_t begin,
f268980d
LP
1579 uint64_t total,
1580 uint64_t last_index) {
a4bcff5b
LP
1581
1582 if (!ci) {
34741aa3
LP
1583 /* If the chain item to cache for this chain is the
1584 * first one it's not worth caching anything */
1585 if (array == first)
1586 return;
1587
29433089 1588 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 1589 ci = ordered_hashmap_steal_first(h);
29433089
LP
1590 assert(ci);
1591 } else {
a4bcff5b
LP
1592 ci = new(ChainCacheItem, 1);
1593 if (!ci)
1594 return;
1595 }
1596
1597 ci->first = first;
1598
4743015d 1599 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
1600 free(ci);
1601 return;
1602 }
1603 } else
1604 assert(ci->first == first);
1605
1606 ci->array = array;
1607 ci->begin = begin;
1608 ci->total = total;
f268980d 1609 ci->last_index = last_index;
a4bcff5b
LP
1610}
1611
f268980d
LP
1612static int generic_array_get(
1613 JournalFile *f,
1614 uint64_t first,
1615 uint64_t i,
1616 Object **ret, uint64_t *offset) {
de190aef 1617
cec736d2 1618 Object *o;
a4bcff5b 1619 uint64_t p = 0, a, t = 0;
cec736d2 1620 int r;
a4bcff5b 1621 ChainCacheItem *ci;
cec736d2
LP
1622
1623 assert(f);
1624
de190aef 1625 a = first;
a4bcff5b
LP
1626
1627 /* Try the chain cache first */
4743015d 1628 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1629 if (ci && i > ci->total) {
1630 a = ci->array;
1631 i -= ci->total;
1632 t = ci->total;
1633 }
1634
de190aef 1635 while (a > 0) {
a4bcff5b 1636 uint64_t k;
cec736d2 1637
de190aef
LP
1638 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1639 if (r < 0)
1640 return r;
cec736d2 1641
a4bcff5b
LP
1642 k = journal_file_entry_array_n_items(o);
1643 if (i < k) {
de190aef 1644 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1645 goto found;
cec736d2
LP
1646 }
1647
a4bcff5b
LP
1648 i -= k;
1649 t += k;
de190aef
LP
1650 a = le64toh(o->entry_array.next_entry_array_offset);
1651 }
1652
a4bcff5b
LP
1653 return 0;
1654
1655found:
1656 /* Let's cache this item for the next invocation */
af13a6b0 1657 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1658
1659 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1660 if (r < 0)
1661 return r;
1662
1663 if (ret)
1664 *ret = o;
1665
1666 if (offset)
1667 *offset = p;
1668
1669 return 1;
1670}
1671
f268980d
LP
1672static int generic_array_get_plus_one(
1673 JournalFile *f,
1674 uint64_t extra,
1675 uint64_t first,
1676 uint64_t i,
1677 Object **ret, uint64_t *offset) {
de190aef
LP
1678
1679 Object *o;
1680
1681 assert(f);
1682
1683 if (i == 0) {
1684 int r;
1685
1686 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1687 if (r < 0)
1688 return r;
1689
de190aef
LP
1690 if (ret)
1691 *ret = o;
cec736d2 1692
de190aef
LP
1693 if (offset)
1694 *offset = extra;
cec736d2 1695
de190aef 1696 return 1;
cec736d2
LP
1697 }
1698
de190aef
LP
1699 return generic_array_get(f, first, i-1, ret, offset);
1700}
cec736d2 1701
/* Results of the test_object callbacks used when bisecting entry arrays: the tested entry matches
 * the needle (TEST_FOUND), lies before it (TEST_LEFT), or lies after it (TEST_RIGHT). */
enum {
        TEST_FOUND,
        TEST_LEFT,
        TEST_RIGHT
};
cec736d2 1707
f268980d
LP
1708static int generic_array_bisect(
1709 JournalFile *f,
1710 uint64_t first,
1711 uint64_t n,
1712 uint64_t needle,
1713 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1714 direction_t direction,
1715 Object **ret,
1716 uint64_t *offset,
1717 uint64_t *idx) {
1718
1719 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1720 bool subtract_one = false;
1721 Object *o, *array = NULL;
1722 int r;
a4bcff5b 1723 ChainCacheItem *ci;
cec736d2 1724
de190aef
LP
1725 assert(f);
1726 assert(test_object);
cec736d2 1727
a4bcff5b 1728 /* Start with the first array in the chain */
de190aef 1729 a = first;
a4bcff5b 1730
4743015d 1731 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1732 if (ci && n > ci->total) {
1733 /* Ah, we have iterated this bisection array chain
1734 * previously! Let's see if we can skip ahead in the
1735 * chain, as far as the last time. But we can't jump
1736 * backwards in the chain, so let's check that
1737 * first. */
1738
1739 r = test_object(f, ci->begin, needle);
1740 if (r < 0)
1741 return r;
1742
1743 if (r == TEST_LEFT) {
f268980d 1744 /* OK, what we are looking for is right of the
a4bcff5b
LP
1745 * begin of this EntryArray, so let's jump
1746 * straight to previously cached array in the
1747 * chain */
1748
1749 a = ci->array;
1750 n -= ci->total;
1751 t = ci->total;
f268980d 1752 last_index = ci->last_index;
a4bcff5b
LP
1753 }
1754 }
1755
de190aef
LP
1756 while (a > 0) {
1757 uint64_t left, right, k, lp;
1758
1759 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1760 if (r < 0)
1761 return r;
1762
de190aef
LP
1763 k = journal_file_entry_array_n_items(array);
1764 right = MIN(k, n);
1765 if (right <= 0)
1766 return 0;
cec736d2 1767
de190aef
LP
1768 i = right - 1;
1769 lp = p = le64toh(array->entry_array.items[i]);
1770 if (p <= 0)
1771 return -EBADMSG;
cec736d2 1772
de190aef
LP
1773 r = test_object(f, p, needle);
1774 if (r < 0)
1775 return r;
cec736d2 1776
de190aef
LP
1777 if (r == TEST_FOUND)
1778 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1779
1780 if (r == TEST_RIGHT) {
1781 left = 0;
1782 right -= 1;
f268980d
LP
1783
1784 if (last_index != (uint64_t) -1) {
1785 assert(last_index <= right);
1786
1787 /* If we cached the last index we
1788 * looked at, let's try to not to jump
1789 * too wildly around and see if we can
1790 * limit the range to look at early to
1791 * the immediate neighbors of the last
1792 * index we looked at. */
1793
1794 if (last_index > 0) {
1795 uint64_t x = last_index - 1;
1796
1797 p = le64toh(array->entry_array.items[x]);
1798 if (p <= 0)
1799 return -EBADMSG;
1800
1801 r = test_object(f, p, needle);
1802 if (r < 0)
1803 return r;
1804
1805 if (r == TEST_FOUND)
1806 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1807
1808 if (r == TEST_RIGHT)
1809 right = x;
1810 else
1811 left = x + 1;
1812 }
1813
1814 if (last_index < right) {
1815 uint64_t y = last_index + 1;
1816
1817 p = le64toh(array->entry_array.items[y]);
1818 if (p <= 0)
1819 return -EBADMSG;
1820
1821 r = test_object(f, p, needle);
1822 if (r < 0)
1823 return r;
1824
1825 if (r == TEST_FOUND)
1826 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1827
1828 if (r == TEST_RIGHT)
1829 right = y;
1830 else
1831 left = y + 1;
1832 }
f268980d
LP
1833 }
1834
de190aef
LP
1835 for (;;) {
1836 if (left == right) {
1837 if (direction == DIRECTION_UP)
1838 subtract_one = true;
1839
1840 i = left;
1841 goto found;
1842 }
1843
1844 assert(left < right);
de190aef 1845 i = (left + right) / 2;
f268980d 1846
de190aef
LP
1847 p = le64toh(array->entry_array.items[i]);
1848 if (p <= 0)
1849 return -EBADMSG;
1850
1851 r = test_object(f, p, needle);
1852 if (r < 0)
1853 return r;
cec736d2 1854
de190aef
LP
1855 if (r == TEST_FOUND)
1856 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1857
1858 if (r == TEST_RIGHT)
1859 right = i;
1860 else
1861 left = i + 1;
1862 }
1863 }
1864
2173cbf8 1865 if (k >= n) {
cbdca852
LP
1866 if (direction == DIRECTION_UP) {
1867 i = n;
1868 subtract_one = true;
1869 goto found;
1870 }
1871
cec736d2 1872 return 0;
cbdca852 1873 }
cec736d2 1874
de190aef
LP
1875 last_p = lp;
1876
1877 n -= k;
1878 t += k;
f268980d 1879 last_index = (uint64_t) -1;
de190aef 1880 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1881 }
1882
1883 return 0;
de190aef
LP
1884
1885found:
1886 if (subtract_one && t == 0 && i == 0)
1887 return 0;
1888
a4bcff5b 1889 /* Let's cache this item for the next invocation */
af13a6b0 1890 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 1891
de190aef
LP
1892 if (subtract_one && i == 0)
1893 p = last_p;
1894 else if (subtract_one)
1895 p = le64toh(array->entry_array.items[i-1]);
1896 else
1897 p = le64toh(array->entry_array.items[i]);
1898
1899 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1900 if (r < 0)
1901 return r;
1902
1903 if (ret)
1904 *ret = o;
1905
1906 if (offset)
1907 *offset = p;
1908
1909 if (idx)
cbdca852 1910 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1911
1912 return 1;
cec736d2
LP
1913}
1914
f268980d
LP
1915static int generic_array_bisect_plus_one(
1916 JournalFile *f,
1917 uint64_t extra,
1918 uint64_t first,
1919 uint64_t n,
1920 uint64_t needle,
1921 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1922 direction_t direction,
1923 Object **ret,
1924 uint64_t *offset,
1925 uint64_t *idx) {
de190aef 1926
cec736d2 1927 int r;
cbdca852
LP
1928 bool step_back = false;
1929 Object *o;
cec736d2
LP
1930
1931 assert(f);
de190aef 1932 assert(test_object);
cec736d2 1933
de190aef
LP
1934 if (n <= 0)
1935 return 0;
cec736d2 1936
de190aef
LP
1937 /* This bisects the array in object 'first', but first checks
1938 * an extra */
de190aef
LP
1939 r = test_object(f, extra, needle);
1940 if (r < 0)
1941 return r;
a536e261
LP
1942
1943 if (r == TEST_FOUND)
1944 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1945
cbdca852
LP
1946 /* if we are looking with DIRECTION_UP then we need to first
1947 see if in the actual array there is a matching entry, and
1948 return the last one of that. But if there isn't any we need
1949 to return this one. Hence remember this, and return it
1950 below. */
1951 if (r == TEST_LEFT)
1952 step_back = direction == DIRECTION_UP;
de190aef 1953
cbdca852
LP
1954 if (r == TEST_RIGHT) {
1955 if (direction == DIRECTION_DOWN)
1956 goto found;
1957 else
1958 return 0;
a536e261 1959 }
cec736d2 1960
de190aef
LP
1961 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1962
cbdca852
LP
1963 if (r == 0 && step_back)
1964 goto found;
1965
ecf68b1d 1966 if (r > 0 && idx)
de190aef
LP
1967 (*idx) ++;
1968
1969 return r;
cbdca852
LP
1970
1971found:
1972 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1973 if (r < 0)
1974 return r;
1975
1976 if (ret)
1977 *ret = o;
1978
1979 if (offset)
1980 *offset = extra;
1981
1982 if (idx)
1983 *idx = 0;
1984
1985 return 1;
1986}
1987
44a6b1b6 1988_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1989 assert(f);
1990 assert(p > 0);
1991
1992 if (p == needle)
1993 return TEST_FOUND;
1994 else if (p < needle)
1995 return TEST_LEFT;
1996 else
1997 return TEST_RIGHT;
1998}
1999
de190aef
LP
2000static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2001 Object *o;
2002 int r;
2003
2004 assert(f);
2005 assert(p > 0);
2006
2007 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
2008 if (r < 0)
2009 return r;
2010
de190aef
LP
2011 if (le64toh(o->entry.seqnum) == needle)
2012 return TEST_FOUND;
2013 else if (le64toh(o->entry.seqnum) < needle)
2014 return TEST_LEFT;
2015 else
2016 return TEST_RIGHT;
2017}
cec736d2 2018
de190aef
LP
2019int journal_file_move_to_entry_by_seqnum(
2020 JournalFile *f,
2021 uint64_t seqnum,
2022 direction_t direction,
2023 Object **ret,
2024 uint64_t *offset) {
2025
2026 return generic_array_bisect(f,
2027 le64toh(f->header->entry_array_offset),
2028 le64toh(f->header->n_entries),
2029 seqnum,
2030 test_object_seqnum,
2031 direction,
2032 ret, offset, NULL);
2033}
cec736d2 2034
de190aef
LP
2035static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2036 Object *o;
2037 int r;
2038
2039 assert(f);
2040 assert(p > 0);
2041
2042 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2043 if (r < 0)
2044 return r;
2045
2046 if (le64toh(o->entry.realtime) == needle)
2047 return TEST_FOUND;
2048 else if (le64toh(o->entry.realtime) < needle)
2049 return TEST_LEFT;
2050 else
2051 return TEST_RIGHT;
cec736d2
LP
2052}
2053
de190aef
LP
2054int journal_file_move_to_entry_by_realtime(
2055 JournalFile *f,
2056 uint64_t realtime,
2057 direction_t direction,
2058 Object **ret,
2059 uint64_t *offset) {
2060
2061 return generic_array_bisect(f,
2062 le64toh(f->header->entry_array_offset),
2063 le64toh(f->header->n_entries),
2064 realtime,
2065 test_object_realtime,
2066 direction,
2067 ret, offset, NULL);
2068}
2069
2070static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2071 Object *o;
2072 int r;
2073
2074 assert(f);
2075 assert(p > 0);
2076
2077 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2078 if (r < 0)
2079 return r;
2080
2081 if (le64toh(o->entry.monotonic) == needle)
2082 return TEST_FOUND;
2083 else if (le64toh(o->entry.monotonic) < needle)
2084 return TEST_LEFT;
2085 else
2086 return TEST_RIGHT;
2087}
2088
2a560338 2089static int find_data_object_by_boot_id(
47838ab3
ZJS
2090 JournalFile *f,
2091 sd_id128_t boot_id,
2092 Object **o,
2093 uint64_t *b) {
2a560338 2094
47838ab3
ZJS
2095 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2096
2097 sd_id128_to_string(boot_id, t + 9);
2098 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2099}
2100
de190aef
LP
2101int journal_file_move_to_entry_by_monotonic(
2102 JournalFile *f,
2103 sd_id128_t boot_id,
2104 uint64_t monotonic,
2105 direction_t direction,
2106 Object **ret,
2107 uint64_t *offset) {
2108
de190aef
LP
2109 Object *o;
2110 int r;
2111
cbdca852 2112 assert(f);
de190aef 2113
47838ab3 2114 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
2115 if (r < 0)
2116 return r;
cbdca852 2117 if (r == 0)
de190aef
LP
2118 return -ENOENT;
2119
2120 return generic_array_bisect_plus_one(f,
2121 le64toh(o->data.entry_offset),
2122 le64toh(o->data.entry_array_offset),
2123 le64toh(o->data.n_entries),
2124 monotonic,
2125 test_object_monotonic,
2126 direction,
2127 ret, offset, NULL);
2128}
2129
1fc605b0 2130void journal_file_reset_location(JournalFile *f) {
6573ef05 2131 f->location_type = LOCATION_HEAD;
1fc605b0 2132 f->current_offset = 0;
6573ef05
MS
2133 f->current_seqnum = 0;
2134 f->current_realtime = 0;
2135 f->current_monotonic = 0;
2136 zero(f->current_boot_id);
2137 f->current_xor_hash = 0;
2138}
2139
950c07d4 2140void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2141 f->location_type = LOCATION_SEEK;
2142 f->current_offset = offset;
2143 f->current_seqnum = le64toh(o->entry.seqnum);
2144 f->current_realtime = le64toh(o->entry.realtime);
2145 f->current_monotonic = le64toh(o->entry.monotonic);
2146 f->current_boot_id = o->entry.boot_id;
2147 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2148}
2149
d8ae66d7
MS
2150int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2151 assert(af);
2152 assert(bf);
2153 assert(af->location_type == LOCATION_SEEK);
2154 assert(bf->location_type == LOCATION_SEEK);
2155
2156 /* If contents and timestamps match, these entries are
2157 * identical, even if the seqnum does not match */
2158 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2159 af->current_monotonic == bf->current_monotonic &&
2160 af->current_realtime == bf->current_realtime &&
2161 af->current_xor_hash == bf->current_xor_hash)
2162 return 0;
2163
2164 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2165
2166 /* If this is from the same seqnum source, compare
2167 * seqnums */
2168 if (af->current_seqnum < bf->current_seqnum)
2169 return -1;
2170 if (af->current_seqnum > bf->current_seqnum)
2171 return 1;
2172
2173 /* Wow! This is weird, different data but the same
2174 * seqnums? Something is borked, but let's make the
2175 * best of it and compare by time. */
2176 }
2177
2178 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2179
2180 /* If the boot id matches, compare monotonic time */
2181 if (af->current_monotonic < bf->current_monotonic)
2182 return -1;
2183 if (af->current_monotonic > bf->current_monotonic)
2184 return 1;
2185 }
2186
2187 /* Otherwise, compare UTC time */
2188 if (af->current_realtime < bf->current_realtime)
2189 return -1;
2190 if (af->current_realtime > bf->current_realtime)
2191 return 1;
2192
2193 /* Finally, compare by contents */
2194 if (af->current_xor_hash < bf->current_xor_hash)
2195 return -1;
2196 if (af->current_xor_hash > bf->current_xor_hash)
2197 return 1;
2198
2199 return 0;
2200}
2201
de190aef
LP
2202int journal_file_next_entry(
2203 JournalFile *f,
f534928a 2204 uint64_t p,
de190aef
LP
2205 direction_t direction,
2206 Object **ret, uint64_t *offset) {
2207
fb099c8d 2208 uint64_t i, n, ofs;
cec736d2
LP
2209 int r;
2210
2211 assert(f);
de190aef
LP
2212
2213 n = le64toh(f->header->n_entries);
2214 if (n <= 0)
2215 return 0;
cec736d2 2216
f534928a 2217 if (p == 0)
de190aef 2218 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2219 else {
de190aef
LP
2220 r = generic_array_bisect(f,
2221 le64toh(f->header->entry_array_offset),
2222 le64toh(f->header->n_entries),
2223 p,
2224 test_object_offset,
2225 DIRECTION_DOWN,
2226 NULL, NULL,
2227 &i);
2228 if (r <= 0)
2229 return r;
2230
2231 if (direction == DIRECTION_DOWN) {
2232 if (i >= n - 1)
2233 return 0;
2234
2235 i++;
2236 } else {
2237 if (i <= 0)
2238 return 0;
2239
2240 i--;
2241 }
cec736d2
LP
2242 }
2243
de190aef 2244 /* And jump to it */
fb099c8d
ZJS
2245 r = generic_array_get(f,
2246 le64toh(f->header->entry_array_offset),
2247 i,
2248 ret, &ofs);
2249 if (r <= 0)
2250 return r;
2251
2252 if (p > 0 &&
2253 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2254 log_debug("%s: entry array corrupted at entry %"PRIu64,
2255 f->path, i);
2256 return -EBADMSG;
2257 }
2258
2259 if (offset)
2260 *offset = ofs;
2261
2262 return 1;
de190aef 2263}
cec736d2 2264
de190aef
LP
2265int journal_file_next_entry_for_data(
2266 JournalFile *f,
2267 Object *o, uint64_t p,
2268 uint64_t data_offset,
2269 direction_t direction,
2270 Object **ret, uint64_t *offset) {
2271
2272 uint64_t n, i;
cec736d2 2273 int r;
de190aef 2274 Object *d;
cec736d2
LP
2275
2276 assert(f);
de190aef 2277 assert(p > 0 || !o);
cec736d2 2278
de190aef 2279 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2280 if (r < 0)
de190aef 2281 return r;
cec736d2 2282
de190aef
LP
2283 n = le64toh(d->data.n_entries);
2284 if (n <= 0)
2285 return n;
cec736d2 2286
de190aef
LP
2287 if (!o)
2288 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2289 else {
2290 if (o->object.type != OBJECT_ENTRY)
2291 return -EINVAL;
cec736d2 2292
de190aef
LP
2293 r = generic_array_bisect_plus_one(f,
2294 le64toh(d->data.entry_offset),
2295 le64toh(d->data.entry_array_offset),
2296 le64toh(d->data.n_entries),
2297 p,
2298 test_object_offset,
2299 DIRECTION_DOWN,
2300 NULL, NULL,
2301 &i);
2302
2303 if (r <= 0)
cec736d2
LP
2304 return r;
2305
de190aef
LP
2306 if (direction == DIRECTION_DOWN) {
2307 if (i >= n - 1)
2308 return 0;
cec736d2 2309
de190aef
LP
2310 i++;
2311 } else {
2312 if (i <= 0)
2313 return 0;
cec736d2 2314
de190aef
LP
2315 i--;
2316 }
cec736d2 2317
de190aef 2318 }
cec736d2 2319
de190aef
LP
2320 return generic_array_get_plus_one(f,
2321 le64toh(d->data.entry_offset),
2322 le64toh(d->data.entry_array_offset),
2323 i,
2324 ret, offset);
2325}
cec736d2 2326
cbdca852
LP
2327int journal_file_move_to_entry_by_offset_for_data(
2328 JournalFile *f,
2329 uint64_t data_offset,
2330 uint64_t p,
2331 direction_t direction,
2332 Object **ret, uint64_t *offset) {
2333
2334 int r;
2335 Object *d;
2336
2337 assert(f);
2338
2339 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2340 if (r < 0)
2341 return r;
2342
2343 return generic_array_bisect_plus_one(f,
2344 le64toh(d->data.entry_offset),
2345 le64toh(d->data.entry_array_offset),
2346 le64toh(d->data.n_entries),
2347 p,
2348 test_object_offset,
2349 direction,
2350 ret, offset, NULL);
2351}
2352
2353int journal_file_move_to_entry_by_monotonic_for_data(
2354 JournalFile *f,
2355 uint64_t data_offset,
2356 sd_id128_t boot_id,
2357 uint64_t monotonic,
2358 direction_t direction,
2359 Object **ret, uint64_t *offset) {
2360
cbdca852
LP
2361 Object *o, *d;
2362 int r;
2363 uint64_t b, z;
2364
2365 assert(f);
2366
2367 /* First, seek by time */
47838ab3 2368 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2369 if (r < 0)
2370 return r;
2371 if (r == 0)
2372 return -ENOENT;
2373
2374 r = generic_array_bisect_plus_one(f,
2375 le64toh(o->data.entry_offset),
2376 le64toh(o->data.entry_array_offset),
2377 le64toh(o->data.n_entries),
2378 monotonic,
2379 test_object_monotonic,
2380 direction,
2381 NULL, &z, NULL);
2382 if (r <= 0)
2383 return r;
2384
2385 /* And now, continue seeking until we find an entry that
2386 * exists in both bisection arrays */
2387
2388 for (;;) {
2389 Object *qo;
2390 uint64_t p, q;
2391
2392 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2393 if (r < 0)
2394 return r;
2395
2396 r = generic_array_bisect_plus_one(f,
2397 le64toh(d->data.entry_offset),
2398 le64toh(d->data.entry_array_offset),
2399 le64toh(d->data.n_entries),
2400 z,
2401 test_object_offset,
2402 direction,
2403 NULL, &p, NULL);
2404 if (r <= 0)
2405 return r;
2406
2407 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2408 if (r < 0)
2409 return r;
2410
2411 r = generic_array_bisect_plus_one(f,
2412 le64toh(o->data.entry_offset),
2413 le64toh(o->data.entry_array_offset),
2414 le64toh(o->data.n_entries),
2415 p,
2416 test_object_offset,
2417 direction,
2418 &qo, &q, NULL);
2419
2420 if (r <= 0)
2421 return r;
2422
2423 if (p == q) {
2424 if (ret)
2425 *ret = qo;
2426 if (offset)
2427 *offset = q;
2428
2429 return 1;
2430 }
2431
2432 z = q;
2433 }
cbdca852
LP
2434}
2435
de190aef
LP
2436int journal_file_move_to_entry_by_seqnum_for_data(
2437 JournalFile *f,
2438 uint64_t data_offset,
2439 uint64_t seqnum,
2440 direction_t direction,
2441 Object **ret, uint64_t *offset) {
cec736d2 2442
de190aef
LP
2443 Object *d;
2444 int r;
cec736d2 2445
91a31dde
LP
2446 assert(f);
2447
de190aef 2448 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2449 if (r < 0)
de190aef 2450 return r;
cec736d2 2451
de190aef
LP
2452 return generic_array_bisect_plus_one(f,
2453 le64toh(d->data.entry_offset),
2454 le64toh(d->data.entry_array_offset),
2455 le64toh(d->data.n_entries),
2456 seqnum,
2457 test_object_seqnum,
2458 direction,
2459 ret, offset, NULL);
2460}
cec736d2 2461
de190aef
LP
2462int journal_file_move_to_entry_by_realtime_for_data(
2463 JournalFile *f,
2464 uint64_t data_offset,
2465 uint64_t realtime,
2466 direction_t direction,
2467 Object **ret, uint64_t *offset) {
2468
2469 Object *d;
2470 int r;
2471
91a31dde
LP
2472 assert(f);
2473
de190aef 2474 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2475 if (r < 0)
de190aef
LP
2476 return r;
2477
2478 return generic_array_bisect_plus_one(f,
2479 le64toh(d->data.entry_offset),
2480 le64toh(d->data.entry_array_offset),
2481 le64toh(d->data.n_entries),
2482 realtime,
2483 test_object_realtime,
2484 direction,
2485 ret, offset, NULL);
cec736d2
LP
2486}
2487
0284adc6 2488void journal_file_dump(JournalFile *f) {
7560fffc 2489 Object *o;
7560fffc 2490 int r;
0284adc6 2491 uint64_t p;
7560fffc
LP
2492
2493 assert(f);
2494
0284adc6 2495 journal_file_print_header(f);
7560fffc 2496
0284adc6
LP
2497 p = le64toh(f->header->header_size);
2498 while (p != 0) {
d05089d8 2499 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
2500 if (r < 0)
2501 goto fail;
7560fffc 2502
0284adc6 2503 switch (o->object.type) {
d98cc1f2 2504
0284adc6
LP
2505 case OBJECT_UNUSED:
2506 printf("Type: OBJECT_UNUSED\n");
2507 break;
d98cc1f2 2508
0284adc6
LP
2509 case OBJECT_DATA:
2510 printf("Type: OBJECT_DATA\n");
2511 break;
7560fffc 2512
3c1668da
LP
2513 case OBJECT_FIELD:
2514 printf("Type: OBJECT_FIELD\n");
2515 break;
2516
0284adc6 2517 case OBJECT_ENTRY:
507f22bd
ZJS
2518 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2519 le64toh(o->entry.seqnum),
2520 le64toh(o->entry.monotonic),
2521 le64toh(o->entry.realtime));
0284adc6 2522 break;
7560fffc 2523
0284adc6
LP
2524 case OBJECT_FIELD_HASH_TABLE:
2525 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2526 break;
7560fffc 2527
0284adc6
LP
2528 case OBJECT_DATA_HASH_TABLE:
2529 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2530 break;
7560fffc 2531
0284adc6
LP
2532 case OBJECT_ENTRY_ARRAY:
2533 printf("Type: OBJECT_ENTRY_ARRAY\n");
2534 break;
7560fffc 2535
0284adc6 2536 case OBJECT_TAG:
507f22bd
ZJS
2537 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2538 le64toh(o->tag.seqnum),
2539 le64toh(o->tag.epoch));
0284adc6 2540 break;
3c1668da
LP
2541
2542 default:
8facc349 2543 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 2544 break;
0284adc6 2545 }
7560fffc 2546
d89c8fdf
ZJS
2547 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2548 printf("Flags: %s\n",
2549 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 2550
0284adc6
LP
2551 if (p == le64toh(f->header->tail_object_offset))
2552 p = 0;
2553 else
2554 p = p + ALIGN64(le64toh(o->object.size));
2555 }
7560fffc 2556
0284adc6
LP
2557 return;
2558fail:
2559 log_error("File corrupt");
7560fffc
LP
2560}
2561
718fe4b1
ZJS
2562static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2563 const char *x;
2564
2565 x = format_timestamp(buf, l, t);
2566 if (x)
2567 return x;
2568 return " --- ";
2569}
2570
0284adc6 2571void journal_file_print_header(JournalFile *f) {
2765b7bb 2572 char a[33], b[33], c[33], d[33];
ed375beb 2573 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2574 struct stat st;
2575 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2576
2577 assert(f);
7560fffc 2578
0284adc6
LP
2579 printf("File Path: %s\n"
2580 "File ID: %s\n"
2581 "Machine ID: %s\n"
2582 "Boot ID: %s\n"
2583 "Sequential Number ID: %s\n"
2584 "State: %s\n"
2585 "Compatible Flags:%s%s\n"
d89c8fdf 2586 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
2587 "Header size: %"PRIu64"\n"
2588 "Arena size: %"PRIu64"\n"
2589 "Data Hash Table Size: %"PRIu64"\n"
2590 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2591 "Rotate Suggested: %s\n"
507f22bd
ZJS
2592 "Head Sequential Number: %"PRIu64"\n"
2593 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2594 "Head Realtime Timestamp: %s\n"
3223f44f 2595 "Tail Realtime Timestamp: %s\n"
ed375beb 2596 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2597 "Objects: %"PRIu64"\n"
2598 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2599 f->path,
2600 sd_id128_to_string(f->header->file_id, a),
2601 sd_id128_to_string(f->header->machine_id, b),
2602 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2603 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2604 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2605 f->header->state == STATE_ONLINE ? "ONLINE" :
2606 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 2607 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
2608 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2609 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2610 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2611 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
2612 le64toh(f->header->header_size),
2613 le64toh(f->header->arena_size),
2614 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2615 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2616 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2617 le64toh(f->header->head_entry_seqnum),
2618 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2619 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2620 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2621 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2622 le64toh(f->header->n_objects),
2623 le64toh(f->header->n_entries));
7560fffc 2624
0284adc6 2625 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2626 printf("Data Objects: %"PRIu64"\n"
0284adc6 2627 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2628 le64toh(f->header->n_data),
0284adc6 2629 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2630
0284adc6 2631 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2632 printf("Field Objects: %"PRIu64"\n"
0284adc6 2633 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2634 le64toh(f->header->n_fields),
0284adc6 2635 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2636
2637 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2638 printf("Tag Objects: %"PRIu64"\n",
2639 le64toh(f->header->n_tags));
3223f44f 2640 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2641 printf("Entry Array Objects: %"PRIu64"\n",
2642 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2643
2644 if (fstat(f->fd, &st) >= 0)
59f448cf 2645 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
7560fffc
LP
2646}
2647
fc68c929
LP
2648static int journal_file_warn_btrfs(JournalFile *f) {
2649 unsigned attrs;
2650 int r;
2651
2652 assert(f);
2653
2654 /* Before we write anything, check if the COW logic is turned
2655 * off on btrfs. Given our write pattern that is quite
2656 * unfriendly to COW file systems this should greatly improve
2657 * performance on COW file systems, such as btrfs, at the
2658 * expense of data integrity features (which shouldn't be too
2659 * bad, given that we do our own checksumming). */
2660
2661 r = btrfs_is_filesystem(f->fd);
2662 if (r < 0)
2663 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2664 if (!r)
2665 return 0;
2666
2667 r = read_attr_fd(f->fd, &attrs);
2668 if (r < 0)
2669 return log_warning_errno(r, "Failed to read file attributes: %m");
2670
2671 if (attrs & FS_NOCOW_FL) {
2672 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2673 return 0;
2674 }
2675
2676 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2677 "This is likely to slow down journal access substantially, please consider turning "
2678 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2679
2680 return 1;
2681}
2682
0284adc6
LP
2683int journal_file_open(
2684 const char *fname,
2685 int flags,
2686 mode_t mode,
2687 bool compress,
baed47c3 2688 bool seal,
0284adc6
LP
2689 JournalMetrics *metrics,
2690 MMapCache *mmap_cache,
2691 JournalFile *template,
2692 JournalFile **ret) {
7560fffc 2693
fa6ac760 2694 bool newly_created = false;
0284adc6 2695 JournalFile *f;
fa6ac760 2696 void *h;
0284adc6 2697 int r;
7560fffc 2698
0284adc6 2699 assert(fname);
0559d3a5 2700 assert(ret);
7560fffc 2701
0284adc6
LP
2702 if ((flags & O_ACCMODE) != O_RDONLY &&
2703 (flags & O_ACCMODE) != O_RDWR)
2704 return -EINVAL;
7560fffc 2705
a0108012
LP
2706 if (!endswith(fname, ".journal") &&
2707 !endswith(fname, ".journal~"))
0284adc6 2708 return -EINVAL;
7560fffc 2709
0284adc6
LP
2710 f = new0(JournalFile, 1);
2711 if (!f)
2712 return -ENOMEM;
7560fffc 2713
0284adc6
LP
2714 f->fd = -1;
2715 f->mode = mode;
7560fffc 2716
0284adc6
LP
2717 f->flags = flags;
2718 f->prot = prot_from_flags(flags);
2719 f->writable = (flags & O_ACCMODE) != O_RDONLY;
92261977 2720#if defined(HAVE_LZ4)
d89c8fdf
ZJS
2721 f->compress_lz4 = compress;
2722#elif defined(HAVE_XZ)
2723 f->compress_xz = compress;
48b61739 2724#endif
49a32d43 2725#ifdef HAVE_GCRYPT
baed47c3 2726 f->seal = seal;
49a32d43 2727#endif
7560fffc 2728
0284adc6
LP
2729 if (mmap_cache)
2730 f->mmap = mmap_cache_ref(mmap_cache);
2731 else {
84168d80 2732 f->mmap = mmap_cache_new();
0284adc6
LP
2733 if (!f->mmap) {
2734 r = -ENOMEM;
2735 goto fail;
2736 }
2737 }
7560fffc 2738
0284adc6
LP
2739 f->path = strdup(fname);
2740 if (!f->path) {
2741 r = -ENOMEM;
2742 goto fail;
2743 }
7560fffc 2744
4743015d 2745 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
2746 if (!f->chain_cache) {
2747 r = -ENOMEM;
2748 goto fail;
2749 }
2750
0284adc6
LP
2751 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2752 if (f->fd < 0) {
2753 r = -errno;
2754 goto fail;
7560fffc 2755 }
7560fffc 2756
2678031a
LP
2757 r = journal_file_fstat(f);
2758 if (r < 0)
0284adc6 2759 goto fail;
7560fffc 2760
0284adc6 2761 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a 2762
fc68c929 2763 (void) journal_file_warn_btrfs(f);
11689d2a 2764
fb0951b0
LP
2765 /* Let's attach the creation time to the journal file,
2766 * so that the vacuuming code knows the age of this
2767 * file even if the file might end up corrupted one
2768 * day... Ideally we'd just use the creation time many
2769 * file systems maintain for each file, but there is
2770 * currently no usable API to query this, hence let's
2771 * emulate this via extended attributes. If extended
2772 * attributes are not supported we'll just skip this,
7517e174 2773 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0 2774
d61b600d 2775 fd_setcrtime(f->fd, 0);
7560fffc 2776
feb12d3e 2777#ifdef HAVE_GCRYPT
0284adc6 2778 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2779 * just don't do sealing */
49a32d43
LP
2780 if (f->seal) {
2781 r = journal_file_fss_load(f);
2782 if (r < 0)
2783 f->seal = false;
2784 }
feb12d3e 2785#endif
7560fffc 2786
0284adc6
LP
2787 r = journal_file_init_header(f, template);
2788 if (r < 0)
2789 goto fail;
7560fffc 2790
2678031a
LP
2791 r = journal_file_fstat(f);
2792 if (r < 0)
0284adc6 2793 goto fail;
fb0951b0
LP
2794
2795 newly_created = true;
0284adc6 2796 }
7560fffc 2797
0284adc6 2798 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
cfb571f3 2799 r = -ENODATA;
0284adc6
LP
2800 goto fail;
2801 }
7560fffc 2802
fa6ac760 2803 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
977eaa1e 2804 if (r < 0)
0284adc6 2805 goto fail;
7560fffc 2806
fa6ac760
LP
2807 f->header = h;
2808
0284adc6
LP
2809 if (!newly_created) {
2810 r = journal_file_verify_header(f);
2811 if (r < 0)
2812 goto fail;
2813 }
7560fffc 2814
feb12d3e 2815#ifdef HAVE_GCRYPT
0284adc6 2816 if (!newly_created && f->writable) {
baed47c3 2817 r = journal_file_fss_load(f);
0284adc6
LP
2818 if (r < 0)
2819 goto fail;
2820 }
feb12d3e 2821#endif
cec736d2
LP
2822
2823 if (f->writable) {
4a92baf3
LP
2824 if (metrics) {
2825 journal_default_metrics(metrics, f->fd);
2826 f->metrics = *metrics;
2827 } else if (template)
2828 f->metrics = template->metrics;
2829
cec736d2
LP
2830 r = journal_file_refresh_header(f);
2831 if (r < 0)
2832 goto fail;
2833 }
2834
feb12d3e 2835#ifdef HAVE_GCRYPT
baed47c3 2836 r = journal_file_hmac_setup(f);
14d10188
LP
2837 if (r < 0)
2838 goto fail;
feb12d3e 2839#endif
14d10188 2840
cec736d2 2841 if (newly_created) {
de190aef 2842 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2843 if (r < 0)
2844 goto fail;
2845
de190aef 2846 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2847 if (r < 0)
2848 goto fail;
7560fffc 2849
feb12d3e 2850#ifdef HAVE_GCRYPT
7560fffc
LP
2851 r = journal_file_append_first_tag(f);
2852 if (r < 0)
2853 goto fail;
feb12d3e 2854#endif
cec736d2
LP
2855 }
2856
fa6ac760
LP
2857 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2858 r = -EIO;
2859 goto fail;
2860 }
2861
7a24f3bf 2862 if (template && template->post_change_timer) {
e167d7fd
LP
2863 r = journal_file_enable_post_change_timer(
2864 f,
2865 sd_event_source_get_event(template->post_change_timer),
2866 template->post_change_timer_period);
7a24f3bf 2867
7a24f3bf
VC
2868 if (r < 0)
2869 goto fail;
2870 }
2871
0559d3a5 2872 *ret = f;
cec736d2
LP
2873 return 0;
2874
2875fail:
fa6ac760
LP
2876 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2877 r = -EIO;
2878
cec736d2
LP
2879 journal_file_close(f);
2880
2881 return r;
2882}
0ac38b70 2883
baed47c3 2884int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2885 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2886 size_t l;
2887 JournalFile *old_file, *new_file = NULL;
2888 int r;
2889
2890 assert(f);
2891 assert(*f);
2892
2893 old_file = *f;
2894
2895 if (!old_file->writable)
2896 return -EINVAL;
2897
2898 if (!endswith(old_file->path, ".journal"))
2899 return -EINVAL;
2900
2901 l = strlen(old_file->path);
57535f47
ZJS
2902 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2903 (int) l - 8, old_file->path,
2904 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2905 le64toh((*f)->header->head_entry_seqnum),
2906 le64toh((*f)->header->head_entry_realtime));
2907 if (r < 0)
0ac38b70
LP
2908 return -ENOMEM;
2909
2678031a
LP
2910 /* Try to rename the file to the archived version. If the file
2911 * already was deleted, we'll get ENOENT, let's ignore that
2912 * case. */
0ac38b70 2913 r = rename(old_file->path, p);
2678031a 2914 if (r < 0 && errno != ENOENT)
0ac38b70
LP
2915 return -errno;
2916
ccdbaf91 2917 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2918
f27a3864
LP
2919 /* Currently, btrfs is not very good with out write patterns
2920 * and fragments heavily. Let's defrag our journal files when
2921 * we archive them */
2922 old_file->defrag_on_close = true;
2923
baed47c3 2924 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2925 journal_file_close(old_file);
2926
2927 *f = new_file;
2928 return r;
2929}
2930
9447a7f1
LP
2931int journal_file_open_reliably(
2932 const char *fname,
2933 int flags,
2934 mode_t mode,
7560fffc 2935 bool compress,
baed47c3 2936 bool seal,
4a92baf3 2937 JournalMetrics *metrics,
27370278 2938 MMapCache *mmap_cache,
9447a7f1
LP
2939 JournalFile *template,
2940 JournalFile **ret) {
2941
2942 int r;
2943 size_t l;
ed375beb 2944 _cleanup_free_ char *p = NULL;
9447a7f1 2945
070052ab 2946 r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
288359db
ZJS
2947 if (!IN_SET(r,
2948 -EBADMSG, /* corrupted */
2949 -ENODATA, /* truncated */
2950 -EHOSTDOWN, /* other machine */
2951 -EPROTONOSUPPORT, /* incompatible feature */
2952 -EBUSY, /* unclean shutdown */
2953 -ESHUTDOWN, /* already archived */
2954 -EIO, /* IO error, including SIGBUS on mmap */
2955 -EIDRM /* File has been deleted */))
9447a7f1
LP
2956 return r;
2957
2958 if ((flags & O_ACCMODE) == O_RDONLY)
2959 return r;
2960
2961 if (!(flags & O_CREAT))
2962 return r;
2963
7560fffc
LP
2964 if (!endswith(fname, ".journal"))
2965 return r;
2966
5c70eab4
LP
2967 /* The file is corrupted. Rotate it away and try it again (but only once) */
2968
9447a7f1 2969 l = strlen(fname);
d587eca5 2970 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
57535f47 2971 (int) l - 8, fname,
d587eca5 2972 now(CLOCK_REALTIME),
9bf3b535 2973 random_u64()) < 0)
9447a7f1
LP
2974 return -ENOMEM;
2975
65089b82 2976 if (rename(fname, p) < 0)
9447a7f1
LP
2977 return -errno;
2978
f27a3864
LP
2979 /* btrfs doesn't cope well with our write pattern and
2980 * fragments heavily. Let's defrag all files we rotate */
11689d2a
LP
2981
2982 (void) chattr_path(p, false, FS_NOCOW_FL);
f27a3864
LP
2983 (void) btrfs_defrag(p);
2984
65089b82 2985 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2986
070052ab 2987 return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
9447a7f1
LP
2988}
2989
cf244689
LP
2990int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2991 uint64_t i, n;
2992 uint64_t q, xor_hash = 0;
2993 int r;
2994 EntryItem *items;
2995 dual_timestamp ts;
2996
2997 assert(from);
2998 assert(to);
2999 assert(o);
3000 assert(p);
3001
3002 if (!to->writable)
3003 return -EPERM;
3004
3005 ts.monotonic = le64toh(o->entry.monotonic);
3006 ts.realtime = le64toh(o->entry.realtime);
3007
cf244689 3008 n = journal_file_entry_n_items(o);
4faa7004
TA
3009 /* alloca() can't take 0, hence let's allocate at least one */
3010 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
3011
3012 for (i = 0; i < n; i++) {
4fd052ae
FC
3013 uint64_t l, h;
3014 le64_t le_hash;
cf244689
LP
3015 size_t t;
3016 void *data;
3017 Object *u;
3018
3019 q = le64toh(o->entry.items[i].object_offset);
3020 le_hash = o->entry.items[i].hash;
3021
3022 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3023 if (r < 0)
3024 return r;
3025
3026 if (le_hash != o->data.hash)
3027 return -EBADMSG;
3028
3029 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3030 t = (size_t) l;
3031
3032 /* We hit the limit on 32bit machines */
3033 if ((uint64_t) t != l)
3034 return -E2BIG;
3035
d89c8fdf 3036 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 3037#if defined(HAVE_XZ) || defined(HAVE_LZ4)
a7f7d1bd 3038 size_t rsize = 0;
cf244689 3039
d89c8fdf
ZJS
3040 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3041 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3042 if (r < 0)
3043 return r;
cf244689
LP
3044
3045 data = from->compress_buffer;
3046 l = rsize;
3b1a55e1
ZJS
3047#else
3048 return -EPROTONOSUPPORT;
3049#endif
cf244689
LP
3050 } else
3051 data = o->data.payload;
3052
3053 r = journal_file_append_data(to, data, l, &u, &h);
3054 if (r < 0)
3055 return r;
3056
3057 xor_hash ^= le64toh(u->data.hash);
3058 items[i].object_offset = htole64(h);
3059 items[i].hash = u->data.hash;
3060
3061 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3062 if (r < 0)
3063 return r;
3064 }
3065
fa6ac760
LP
3066 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
3067
3068 if (mmap_cache_got_sigbus(to->mmap, to->fd))
3069 return -EIO;
3070
3071 return r;
cf244689 3072}
babfc091 3073
8580d1f7
LP
3074void journal_reset_metrics(JournalMetrics *m) {
3075 assert(m);
3076
3077 /* Set everything to "pick automatic values". */
3078
3079 *m = (JournalMetrics) {
3080 .min_use = (uint64_t) -1,
3081 .max_use = (uint64_t) -1,
3082 .min_size = (uint64_t) -1,
3083 .max_size = (uint64_t) -1,
3084 .keep_free = (uint64_t) -1,
3085 .n_max_files = (uint64_t) -1,
3086 };
3087}
3088
babfc091 3089void journal_default_metrics(JournalMetrics *m, int fd) {
8580d1f7 3090 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
babfc091 3091 struct statvfs ss;
8580d1f7 3092 uint64_t fs_size;
babfc091
LP
3093
3094 assert(m);
3095 assert(fd >= 0);
3096
3097 if (fstatvfs(fd, &ss) >= 0)
3098 fs_size = ss.f_frsize * ss.f_blocks;
8580d1f7
LP
3099 else {
3100 log_debug_errno(errno, "Failed to detremine disk size: %m");
3101 fs_size = 0;
3102 }
babfc091
LP
3103
3104 if (m->max_use == (uint64_t) -1) {
3105
3106 if (fs_size > 0) {
3107 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3108
3109 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3110 m->max_use = DEFAULT_MAX_USE_UPPER;
3111
3112 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3113 m->max_use = DEFAULT_MAX_USE_LOWER;
3114 } else
3115 m->max_use = DEFAULT_MAX_USE_LOWER;
3116 } else {
3117 m->max_use = PAGE_ALIGN(m->max_use);
3118
8580d1f7 3119 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
babfc091
LP
3120 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3121 }
3122
8580d1f7
LP
3123 if (m->min_use == (uint64_t) -1)
3124 m->min_use = DEFAULT_MIN_USE;
3125
3126 if (m->min_use > m->max_use)
3127 m->min_use = m->max_use;
3128
babfc091
LP
3129 if (m->max_size == (uint64_t) -1) {
3130 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3131
3132 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3133 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3134 } else
3135 m->max_size = PAGE_ALIGN(m->max_size);
3136
8580d1f7
LP
3137 if (m->max_size != 0) {
3138 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3139 m->max_size = JOURNAL_FILE_SIZE_MIN;
babfc091 3140
8580d1f7
LP
3141 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3142 m->max_use = m->max_size*2;
3143 }
babfc091
LP
3144
3145 if (m->min_size == (uint64_t) -1)
3146 m->min_size = JOURNAL_FILE_SIZE_MIN;
3147 else {
3148 m->min_size = PAGE_ALIGN(m->min_size);
3149
3150 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3151 m->min_size = JOURNAL_FILE_SIZE_MIN;
3152
8580d1f7 3153 if (m->max_size != 0 && m->min_size > m->max_size)
babfc091
LP
3154 m->max_size = m->min_size;
3155 }
3156
3157 if (m->keep_free == (uint64_t) -1) {
3158
3159 if (fs_size > 0) {
8621b110 3160 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
3161
3162 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3163 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3164
3165 } else
3166 m->keep_free = DEFAULT_KEEP_FREE;
3167 }
3168
8580d1f7
LP
3169 if (m->n_max_files == (uint64_t) -1)
3170 m->n_max_files = DEFAULT_N_MAX_FILES;
3171
3172 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3173 format_bytes(a, sizeof(a), m->min_use),
3174 format_bytes(b, sizeof(b), m->max_use),
3175 format_bytes(c, sizeof(c), m->max_size),
3176 format_bytes(d, sizeof(d), m->min_size),
3177 format_bytes(e, sizeof(e), m->keep_free),
3178 m->n_max_files);
babfc091 3179}
08984293
LP
3180
3181int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
3182 assert(f);
3183 assert(from || to);
3184
3185 if (from) {
162566a4
LP
3186 if (f->header->head_entry_realtime == 0)
3187 return -ENOENT;
08984293 3188
162566a4 3189 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
3190 }
3191
3192 if (to) {
162566a4
LP
3193 if (f->header->tail_entry_realtime == 0)
3194 return -ENOENT;
08984293 3195
162566a4 3196 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
3197 }
3198
3199 return 1;
3200}
3201
3202int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
3203 Object *o;
3204 uint64_t p;
3205 int r;
3206
3207 assert(f);
3208 assert(from || to);
3209
47838ab3 3210 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
3211 if (r <= 0)
3212 return r;
3213
3214 if (le64toh(o->data.n_entries) <= 0)
3215 return 0;
3216
3217 if (from) {
3218 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3219 if (r < 0)
3220 return r;
3221
3222 *from = le64toh(o->entry.monotonic);
3223 }
3224
3225 if (to) {
3226 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3227 if (r < 0)
3228 return r;
3229
3230 r = generic_array_get_plus_one(f,
3231 le64toh(o->data.entry_offset),
3232 le64toh(o->data.entry_array_offset),
3233 le64toh(o->data.n_entries)-1,
3234 &o, NULL);
3235 if (r <= 0)
3236 return r;
3237
3238 *to = le64toh(o->entry.monotonic);
3239 }
3240
3241 return 1;
3242}
dca6219e 3243
fb0951b0 3244bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
3245 assert(f);
3246
3247 /* If we gained new header fields we gained new features,
3248 * hence suggest a rotation */
361f9cbc
LP
3249 if (le64toh(f->header->header_size) < sizeof(Header)) {
3250 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3251 return true;
361f9cbc 3252 }
dca6219e
LP
3253
3254 /* Let's check if the hash tables grew over a certain fill
3255 * level (75%, borrowing this value from Java's hash table
3256 * implementation), and if so suggest a rotation. To calculate
3257 * the fill level we need the n_data field, which only exists
3258 * in newer versions. */
3259
3260 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3261 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3262 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3263 f->path,
3264 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3265 le64toh(f->header->n_data),
3266 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3267 (unsigned long long) f->last_stat.st_size,
3268 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3269 return true;
361f9cbc 3270 }
dca6219e
LP
3271
3272 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3273 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3274 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3275 f->path,
3276 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3277 le64toh(f->header->n_fields),
3278 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3279 return true;
361f9cbc 3280 }
dca6219e 3281
0598fd4a
LP
3282 /* Are the data objects properly indexed by field objects? */
3283 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3284 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3285 le64toh(f->header->n_data) > 0 &&
3286 le64toh(f->header->n_fields) == 0)
3287 return true;
3288
fb0951b0
LP
3289 if (max_file_usec > 0) {
3290 usec_t t, h;
3291
3292 h = le64toh(f->header->head_entry_realtime);
3293 t = now(CLOCK_REALTIME);
3294
3295 if (h > 0 && t > h + max_file_usec)
3296 return true;
3297 }
3298
dca6219e
LP
3299 return false;
3300}