]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
journald: keep statistics on how often we hit/miss the mmap cache
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <sys/mman.h>
23#include <errno.h>
24#include <sys/uio.h>
25#include <unistd.h>
26#include <sys/statvfs.h>
27#include <fcntl.h>
28#include <stddef.h>
29
fb0951b0
LP
30#ifdef HAVE_XATTR
31#include <attr/xattr.h>
32#endif
33
cec736d2
LP
34#include "journal-def.h"
35#include "journal-file.h"
0284adc6 36#include "journal-authenticate.h"
cec736d2 37#include "lookup3.h"
807e17f0 38#include "compress.h"
7560fffc 39#include "fsprg.h"
cec736d2 40
4a92baf3
LP
41#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 43
be19b7df 44#define COMPRESSION_SIZE_THRESHOLD (512ULL)
807e17f0 45
babfc091 46/* This is the minimum journal file size */
253f59df 47#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
babfc091
LP
48
49/* These are the lower and upper bounds if we deduce the max_use value
50 * from the file system size */
51#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
52#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
53
54/* This is the upper bound if we deduce max_size from max_use */
71100051 55#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
babfc091
LP
56
57/* This is the upper bound if we deduce the keep_free value from the
58 * file system size */
59#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
60
61/* This is the keep_free value when we can't determine the system
62 * size */
63#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
64
dca6219e
LP
65/* n_data was the first entry we added after the initial file format design */
66#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 67
a4bcff5b
LP
68/* How many entries to keep in the entry array chain cache at max */
69#define CHAIN_CACHE_MAX 20
70
a676e665
LP
71/* How much to increase the journal file size at once each time we allocate something new. */
72#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
73
9588bc32 74static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
75 assert(f);
76
77 if (!f->writable)
78 return -EPERM;
79
80 if (!(f->fd >= 0 && f->header))
81 return -EINVAL;
82
83 switch(f->header->state) {
84 case STATE_ONLINE:
85 return 0;
86
87 case STATE_OFFLINE:
88 f->header->state = STATE_ONLINE;
89 fsync(f->fd);
90 return 0;
91
92 default:
93 return -EINVAL;
94 }
95}
96
97int journal_file_set_offline(JournalFile *f) {
98 assert(f);
99
100 if (!f->writable)
101 return -EPERM;
102
103 if (!(f->fd >= 0 && f->header))
104 return -EINVAL;
105
106 if (f->header->state != STATE_ONLINE)
107 return 0;
108
109 fsync(f->fd);
110
111 f->header->state = STATE_OFFLINE;
112
113 fsync(f->fd);
114
115 return 0;
116}
117
cec736d2 118void journal_file_close(JournalFile *f) {
de190aef 119 assert(f);
cec736d2 120
feb12d3e 121#ifdef HAVE_GCRYPT
b0af6f41 122 /* Write the final tag */
c586dbf1 123 if (f->seal && f->writable)
b0af6f41 124 journal_file_append_tag(f);
feb12d3e 125#endif
b0af6f41 126
7560fffc 127 /* Sync everything to disk, before we mark the file offline */
16e9f408
LP
128 if (f->mmap && f->fd >= 0)
129 mmap_cache_close_fd(f->mmap, f->fd);
7560fffc 130
26687bf8 131 journal_file_set_offline(f);
cec736d2 132
26687bf8 133 if (f->header)
d384c7a8 134 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
cec736d2 135
0ac38b70
LP
136 if (f->fd >= 0)
137 close_nointr_nofail(f->fd);
138
cec736d2 139 free(f->path);
807e17f0 140
16e9f408
LP
141 if (f->mmap)
142 mmap_cache_unref(f->mmap);
143
a4bcff5b
LP
144 hashmap_free_free(f->chain_cache);
145
807e17f0
LP
146#ifdef HAVE_XZ
147 free(f->compress_buffer);
148#endif
149
7560fffc 150#ifdef HAVE_GCRYPT
baed47c3
LP
151 if (f->fss_file)
152 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
b7c9ae91
LP
153 else if (f->fsprg_state)
154 free(f->fsprg_state);
155
156 free(f->fsprg_seed);
7560fffc
LP
157
158 if (f->hmac)
159 gcry_md_close(f->hmac);
160#endif
161
cec736d2
LP
162 free(f);
163}
164
0ac38b70 165static int journal_file_init_header(JournalFile *f, JournalFile *template) {
cec736d2
LP
166 Header h;
167 ssize_t k;
168 int r;
169
170 assert(f);
171
172 zero(h);
7560fffc 173 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 174 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 175
7560fffc
LP
176 h.incompatible_flags =
177 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
178
179 h.compatible_flags =
baed47c3 180 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
7560fffc 181
cec736d2
LP
182 r = sd_id128_randomize(&h.file_id);
183 if (r < 0)
184 return r;
185
0ac38b70
LP
186 if (template) {
187 h.seqnum_id = template->header->seqnum_id;
beec0085 188 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
189 } else
190 h.seqnum_id = h.file_id;
cec736d2
LP
191
192 k = pwrite(f->fd, &h, sizeof(h), 0);
193 if (k < 0)
194 return -errno;
195
196 if (k != sizeof(h))
197 return -EIO;
198
199 return 0;
200}
201
202static int journal_file_refresh_header(JournalFile *f) {
203 int r;
de190aef 204 sd_id128_t boot_id;
cec736d2
LP
205
206 assert(f);
207
208 r = sd_id128_get_machine(&f->header->machine_id);
209 if (r < 0)
210 return r;
211
de190aef 212 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
213 if (r < 0)
214 return r;
215
de190aef
LP
216 if (sd_id128_equal(boot_id, f->header->boot_id))
217 f->tail_entry_monotonic_valid = true;
218
219 f->header->boot_id = boot_id;
220
26687bf8 221 journal_file_set_online(f);
b788cc23 222
7560fffc 223 /* Sync the online state to disk */
a676e665 224 fsync(f->fd);
b788cc23 225
cec736d2
LP
226 return 0;
227}
228
229static int journal_file_verify_header(JournalFile *f) {
230 assert(f);
231
7560fffc 232 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
233 return -EBADMSG;
234
7560fffc
LP
235 /* In both read and write mode we refuse to open files with
236 * incompatible flags we don't know */
807e17f0 237#ifdef HAVE_XZ
7560fffc 238 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
807e17f0
LP
239 return -EPROTONOSUPPORT;
240#else
cec736d2
LP
241 if (f->header->incompatible_flags != 0)
242 return -EPROTONOSUPPORT;
807e17f0 243#endif
cec736d2 244
7560fffc
LP
245 /* When open for writing we refuse to open files with
246 * compatible flags, too */
247 if (f->writable) {
248#ifdef HAVE_GCRYPT
baed47c3 249 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
7560fffc
LP
250 return -EPROTONOSUPPORT;
251#else
252 if (f->header->compatible_flags != 0)
253 return -EPROTONOSUPPORT;
254#endif
255 }
256
db11ac1a
LP
257 if (f->header->state >= _STATE_MAX)
258 return -EBADMSG;
259
dca6219e
LP
260 /* The first addition was n_data, so check that we are at least this large */
261 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
262 return -EBADMSG;
263
8088cbd3 264 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
265 return -EBADMSG;
266
db11ac1a
LP
267 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
268 return -ENODATA;
269
270 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
271 return -ENODATA;
272
7762e02b
LP
273 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
274 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
275 !VALID64(le64toh(f->header->tail_object_offset)) ||
276 !VALID64(le64toh(f->header->entry_array_offset)))
277 return -ENODATA;
278
279 if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
280 le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
281 le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
282 le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
cec736d2
LP
283 return -ENODATA;
284
285 if (f->writable) {
ccdbaf91 286 uint8_t state;
cec736d2
LP
287 sd_id128_t machine_id;
288 int r;
289
290 r = sd_id128_get_machine(&machine_id);
291 if (r < 0)
292 return r;
293
294 if (!sd_id128_equal(machine_id, f->header->machine_id))
295 return -EHOSTDOWN;
296
de190aef 297 state = f->header->state;
cec736d2 298
71fa6f00
LP
299 if (state == STATE_ONLINE) {
300 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
301 return -EBUSY;
302 } else if (state == STATE_ARCHIVED)
cec736d2 303 return -ESHUTDOWN;
71fa6f00
LP
304 else if (state != STATE_OFFLINE) {
305 log_debug("Journal file %s has unknown state %u.", f->path, state);
306 return -EBUSY;
307 }
cec736d2
LP
308 }
309
8088cbd3 310 f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
c586dbf1 311
f1889c91 312 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 313
cec736d2
LP
314 return 0;
315}
316
317static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
a676e665 318 uint64_t old_size, new_size, file_size;
fec2aa2f 319 int r;
cec736d2
LP
320
321 assert(f);
322
cec736d2 323 /* We assume that this file is not sparse, and we know that
38ac38b2 324 * for sure, since we always call posix_fallocate()
cec736d2
LP
325 * ourselves */
326
327 old_size =
23b0b2b2 328 le64toh(f->header->header_size) +
cec736d2
LP
329 le64toh(f->header->arena_size);
330
bc85bfee 331 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
332 if (new_size < le64toh(f->header->header_size))
333 new_size = le64toh(f->header->header_size);
bc85bfee
LP
334
335 if (new_size <= old_size)
cec736d2
LP
336 return 0;
337
a676e665 338 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 339 return -E2BIG;
cec736d2 340
a676e665 341 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
342 struct statvfs svfs;
343
344 if (fstatvfs(f->fd, &svfs) >= 0) {
345 uint64_t available;
346
347 available = svfs.f_bfree * svfs.f_bsize;
348
bc85bfee
LP
349 if (available >= f->metrics.keep_free)
350 available -= f->metrics.keep_free;
cec736d2
LP
351 else
352 available = 0;
353
354 if (new_size - old_size > available)
355 return -E2BIG;
356 }
357 }
358
bc85bfee
LP
359 /* Note that the glibc fallocate() fallback is very
360 inefficient, hence we try to minimize the allocation area
361 as we can. */
fec2aa2f
GV
362 r = posix_fallocate(f->fd, old_size, new_size - old_size);
363 if (r != 0)
364 return -r;
cec736d2 365
a676e665
LP
366 /* Increase the file size a bit further than this, so that we
367 * we can create larger memory maps to cache */
368 file_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
369 if (file_size > (uint64_t) f->last_stat.st_size) {
370 if (file_size > new_size)
371 ftruncate(f->fd, file_size);
372
373 if (fstat(f->fd, &f->last_stat) < 0)
374 return -errno;
375 }
cec736d2 376
23b0b2b2 377 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2
LP
378
379 return 0;
380}
381
fcde2389 382static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
cec736d2 383 assert(f);
cec736d2
LP
384 assert(ret);
385
7762e02b
LP
386 if (size <= 0)
387 return -EINVAL;
388
2a59ea54 389 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
390 if (offset + size > (uint64_t) f->last_stat.st_size) {
391 /* Hmm, out of range? Let's refresh the fstat() data
392 * first, before we trust that check. */
393
394 if (fstat(f->fd, &f->last_stat) < 0 ||
395 offset + size > (uint64_t) f->last_stat.st_size)
396 return -EADDRNOTAVAIL;
397 }
398
fcde2389 399 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
400}
401
16e9f408
LP
402static uint64_t minimum_header_size(Object *o) {
403
b8e891e6 404 static const uint64_t table[] = {
16e9f408
LP
405 [OBJECT_DATA] = sizeof(DataObject),
406 [OBJECT_FIELD] = sizeof(FieldObject),
407 [OBJECT_ENTRY] = sizeof(EntryObject),
408 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
409 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
410 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
411 [OBJECT_TAG] = sizeof(TagObject),
412 };
413
414 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
415 return sizeof(ObjectHeader);
416
417 return table[o->object.type];
418}
419
de190aef 420int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
cec736d2
LP
421 int r;
422 void *t;
423 Object *o;
424 uint64_t s;
16e9f408 425 unsigned context;
cec736d2
LP
426
427 assert(f);
428 assert(ret);
429
db11ac1a
LP
430 /* Objects may only be located at multiple of 64 bit */
431 if (!VALID64(offset))
432 return -EFAULT;
433
16e9f408
LP
434 /* One context for each type, plus one catch-all for the rest */
435 context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
436
fcde2389 437 r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
438 if (r < 0)
439 return r;
440
441 o = (Object*) t;
442 s = le64toh(o->object.size);
443
444 if (s < sizeof(ObjectHeader))
445 return -EBADMSG;
446
16e9f408
LP
447 if (o->object.type <= OBJECT_UNUSED)
448 return -EBADMSG;
449
450 if (s < minimum_header_size(o))
451 return -EBADMSG;
452
3c1668da 453 if (type > 0 && o->object.type != type)
cec736d2
LP
454 return -EBADMSG;
455
456 if (s > sizeof(ObjectHeader)) {
fcde2389 457 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
cec736d2
LP
458 if (r < 0)
459 return r;
460
461 o = (Object*) t;
462 }
463
cec736d2
LP
464 *ret = o;
465 return 0;
466}
467
d98cc1f2 468static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
469 uint64_t r;
470
471 assert(f);
472
beec0085 473 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
474
475 if (seqnum) {
de190aef 476 /* If an external seqnum counter was passed, we update
c2373f84
LP
477 * both the local and the external one, and set it to
478 * the maximum of both */
479
480 if (*seqnum + 1 > r)
481 r = *seqnum + 1;
482
483 *seqnum = r;
484 }
485
beec0085 486 f->header->tail_entry_seqnum = htole64(r);
cec736d2 487
beec0085
LP
488 if (f->header->head_entry_seqnum == 0)
489 f->header->head_entry_seqnum = htole64(r);
de190aef 490
cec736d2
LP
491 return r;
492}
493
0284adc6 494int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
495 int r;
496 uint64_t p;
497 Object *tail, *o;
498 void *t;
499
500 assert(f);
16e9f408 501 assert(type > 0 && type < _OBJECT_TYPE_MAX);
cec736d2
LP
502 assert(size >= sizeof(ObjectHeader));
503 assert(offset);
504 assert(ret);
505
26687bf8
OS
506 r = journal_file_set_online(f);
507 if (r < 0)
508 return r;
509
cec736d2 510 p = le64toh(f->header->tail_object_offset);
cec736d2 511 if (p == 0)
23b0b2b2 512 p = le64toh(f->header->header_size);
cec736d2 513 else {
de190aef 514 r = journal_file_move_to_object(f, -1, p, &tail);
cec736d2
LP
515 if (r < 0)
516 return r;
517
518 p += ALIGN64(le64toh(tail->object.size));
519 }
520
521 r = journal_file_allocate(f, p, size);
522 if (r < 0)
523 return r;
524
fcde2389 525 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
526 if (r < 0)
527 return r;
528
529 o = (Object*) t;
530
531 zero(o->object);
de190aef 532 o->object.type = type;
cec736d2
LP
533 o->object.size = htole64(size);
534
535 f->header->tail_object_offset = htole64(p);
cec736d2
LP
536 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
537
538 *ret = o;
539 *offset = p;
540
541 return 0;
542}
543
de190aef 544static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
545 uint64_t s, p;
546 Object *o;
547 int r;
548
549 assert(f);
550
dfabe643 551 /* We estimate that we need 1 hash table entry per 768 of
4a92baf3
LP
552 journal file and we want to make sure we never get beyond
553 75% fill level. Calculate the hash table size for the
554 maximum file size based on these metrics. */
555
dfabe643 556 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
557 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
558 s = DEFAULT_DATA_HASH_TABLE_SIZE;
559
507f22bd 560 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 561
de190aef
LP
562 r = journal_file_append_object(f,
563 OBJECT_DATA_HASH_TABLE,
564 offsetof(Object, hash_table.items) + s,
565 &o, &p);
cec736d2
LP
566 if (r < 0)
567 return r;
568
de190aef 569 memset(o->hash_table.items, 0, s);
cec736d2 570
de190aef
LP
571 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
572 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
573
574 return 0;
575}
576
de190aef 577static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
578 uint64_t s, p;
579 Object *o;
580 int r;
581
582 assert(f);
583
3c1668da
LP
584 /* We use a fixed size hash table for the fields as this
585 * number should grow very slowly only */
586
de190aef
LP
587 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
588 r = journal_file_append_object(f,
589 OBJECT_FIELD_HASH_TABLE,
590 offsetof(Object, hash_table.items) + s,
591 &o, &p);
cec736d2
LP
592 if (r < 0)
593 return r;
594
de190aef 595 memset(o->hash_table.items, 0, s);
cec736d2 596
de190aef
LP
597 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
598 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
599
600 return 0;
601}
602
de190aef 603static int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
604 uint64_t s, p;
605 void *t;
606 int r;
607
608 assert(f);
609
de190aef
LP
610 p = le64toh(f->header->data_hash_table_offset);
611 s = le64toh(f->header->data_hash_table_size);
cec736d2 612
de190aef 613 r = journal_file_move_to(f,
16e9f408 614 OBJECT_DATA_HASH_TABLE,
fcde2389 615 true,
de190aef
LP
616 p, s,
617 &t);
cec736d2
LP
618 if (r < 0)
619 return r;
620
de190aef 621 f->data_hash_table = t;
cec736d2
LP
622 return 0;
623}
624
de190aef 625static int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
626 uint64_t s, p;
627 void *t;
628 int r;
629
630 assert(f);
631
de190aef
LP
632 p = le64toh(f->header->field_hash_table_offset);
633 s = le64toh(f->header->field_hash_table_size);
cec736d2 634
de190aef 635 r = journal_file_move_to(f,
16e9f408 636 OBJECT_FIELD_HASH_TABLE,
fcde2389 637 true,
de190aef
LP
638 p, s,
639 &t);
cec736d2
LP
640 if (r < 0)
641 return r;
642
de190aef 643 f->field_hash_table = t;
cec736d2
LP
644 return 0;
645}
646
3c1668da
LP
647static int journal_file_link_field(
648 JournalFile *f,
649 Object *o,
650 uint64_t offset,
651 uint64_t hash) {
652
653 uint64_t p, h;
654 int r;
655
656 assert(f);
657 assert(o);
658 assert(offset > 0);
659
660 if (o->object.type != OBJECT_FIELD)
661 return -EINVAL;
662
663 /* This might alter the window we are looking at */
664
665 o->field.next_hash_offset = o->field.head_data_offset = 0;
666
667 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
668 p = le64toh(f->field_hash_table[h].tail_hash_offset);
669 if (p == 0)
670 f->field_hash_table[h].head_hash_offset = htole64(offset);
671 else {
672 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
673 if (r < 0)
674 return r;
675
676 o->field.next_hash_offset = htole64(offset);
677 }
678
679 f->field_hash_table[h].tail_hash_offset = htole64(offset);
680
681 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
682 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
683
684 return 0;
685}
686
687static int journal_file_link_data(
688 JournalFile *f,
689 Object *o,
690 uint64_t offset,
691 uint64_t hash) {
692
de190aef 693 uint64_t p, h;
cec736d2
LP
694 int r;
695
696 assert(f);
697 assert(o);
698 assert(offset > 0);
b588975f
LP
699
700 if (o->object.type != OBJECT_DATA)
701 return -EINVAL;
cec736d2 702
48496df6
LP
703 /* This might alter the window we are looking at */
704
de190aef
LP
705 o->data.next_hash_offset = o->data.next_field_offset = 0;
706 o->data.entry_offset = o->data.entry_array_offset = 0;
707 o->data.n_entries = 0;
cec736d2 708
de190aef 709 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
8db4213e 710 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 711 if (p == 0)
cec736d2 712 /* Only entry in the hash table is easy */
de190aef 713 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 714 else {
48496df6
LP
715 /* Move back to the previous data object, to patch in
716 * pointer */
cec736d2 717
de190aef 718 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
719 if (r < 0)
720 return r;
721
de190aef 722 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
723 }
724
de190aef 725 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 726
dca6219e
LP
727 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
728 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
729
cec736d2
LP
730 return 0;
731}
732
3c1668da
LP
733int journal_file_find_field_object_with_hash(
734 JournalFile *f,
735 const void *field, uint64_t size, uint64_t hash,
736 Object **ret, uint64_t *offset) {
737
738 uint64_t p, osize, h;
739 int r;
740
741 assert(f);
742 assert(field && size > 0);
743
744 osize = offsetof(Object, field.payload) + size;
745
746 if (f->header->field_hash_table_size == 0)
747 return -EBADMSG;
748
749 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
750 p = le64toh(f->field_hash_table[h].head_hash_offset);
751
752 while (p > 0) {
753 Object *o;
754
755 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
756 if (r < 0)
757 return r;
758
759 if (le64toh(o->field.hash) == hash &&
760 le64toh(o->object.size) == osize &&
761 memcmp(o->field.payload, field, size) == 0) {
762
763 if (ret)
764 *ret = o;
765 if (offset)
766 *offset = p;
767
768 return 1;
769 }
770
771 p = le64toh(o->field.next_hash_offset);
772 }
773
774 return 0;
775}
776
777int journal_file_find_field_object(
778 JournalFile *f,
779 const void *field, uint64_t size,
780 Object **ret, uint64_t *offset) {
781
782 uint64_t hash;
783
784 assert(f);
785 assert(field && size > 0);
786
787 hash = hash64(field, size);
788
789 return journal_file_find_field_object_with_hash(f,
790 field, size, hash,
791 ret, offset);
792}
793
de190aef
LP
794int journal_file_find_data_object_with_hash(
795 JournalFile *f,
796 const void *data, uint64_t size, uint64_t hash,
797 Object **ret, uint64_t *offset) {
48496df6 798
de190aef 799 uint64_t p, osize, h;
cec736d2
LP
800 int r;
801
802 assert(f);
803 assert(data || size == 0);
804
805 osize = offsetof(Object, data.payload) + size;
806
bc85bfee
LP
807 if (f->header->data_hash_table_size == 0)
808 return -EBADMSG;
809
de190aef
LP
810 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
811 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 812
de190aef
LP
813 while (p > 0) {
814 Object *o;
cec736d2 815
de190aef 816 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
817 if (r < 0)
818 return r;
819
807e17f0 820 if (le64toh(o->data.hash) != hash)
85a131e8 821 goto next;
807e17f0
LP
822
823 if (o->object.flags & OBJECT_COMPRESSED) {
824#ifdef HAVE_XZ
b785c858 825 uint64_t l, rsize;
cec736d2 826
807e17f0
LP
827 l = le64toh(o->object.size);
828 if (l <= offsetof(Object, data.payload))
cec736d2
LP
829 return -EBADMSG;
830
807e17f0
LP
831 l -= offsetof(Object, data.payload);
832
93b73b06 833 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
807e17f0
LP
834 return -EBADMSG;
835
b785c858 836 if (rsize == size &&
807e17f0
LP
837 memcmp(f->compress_buffer, data, size) == 0) {
838
839 if (ret)
840 *ret = o;
841
842 if (offset)
843 *offset = p;
844
845 return 1;
846 }
847#else
848 return -EPROTONOSUPPORT;
849#endif
850
851 } else if (le64toh(o->object.size) == osize &&
852 memcmp(o->data.payload, data, size) == 0) {
853
cec736d2
LP
854 if (ret)
855 *ret = o;
856
857 if (offset)
858 *offset = p;
859
de190aef 860 return 1;
cec736d2
LP
861 }
862
85a131e8 863 next:
cec736d2
LP
864 p = le64toh(o->data.next_hash_offset);
865 }
866
de190aef
LP
867 return 0;
868}
869
870int journal_file_find_data_object(
871 JournalFile *f,
872 const void *data, uint64_t size,
873 Object **ret, uint64_t *offset) {
874
875 uint64_t hash;
876
877 assert(f);
878 assert(data || size == 0);
879
880 hash = hash64(data, size);
881
882 return journal_file_find_data_object_with_hash(f,
883 data, size, hash,
884 ret, offset);
885}
886
3c1668da
LP
887static int journal_file_append_field(
888 JournalFile *f,
889 const void *field, uint64_t size,
890 Object **ret, uint64_t *offset) {
891
892 uint64_t hash, p;
893 uint64_t osize;
894 Object *o;
895 int r;
896
897 assert(f);
898 assert(field && size > 0);
899
900 hash = hash64(field, size);
901
902 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
903 if (r < 0)
904 return r;
905 else if (r > 0) {
906
907 if (ret)
908 *ret = o;
909
910 if (offset)
911 *offset = p;
912
913 return 0;
914 }
915
916 osize = offsetof(Object, field.payload) + size;
917 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
918 if (r < 0)
919 return r;
3c1668da
LP
920
921 o->field.hash = htole64(hash);
922 memcpy(o->field.payload, field, size);
923
924 r = journal_file_link_field(f, o, p, hash);
925 if (r < 0)
926 return r;
927
928 /* The linking might have altered the window, so let's
929 * refresh our pointer */
930 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
931 if (r < 0)
932 return r;
933
934#ifdef HAVE_GCRYPT
935 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
936 if (r < 0)
937 return r;
938#endif
939
940 if (ret)
941 *ret = o;
942
943 if (offset)
944 *offset = p;
945
946 return 0;
947}
948
48496df6
LP
949static int journal_file_append_data(
950 JournalFile *f,
951 const void *data, uint64_t size,
952 Object **ret, uint64_t *offset) {
953
de190aef
LP
954 uint64_t hash, p;
955 uint64_t osize;
956 Object *o;
957 int r;
807e17f0 958 bool compressed = false;
3c1668da 959 const void *eq;
de190aef
LP
960
961 assert(f);
962 assert(data || size == 0);
963
964 hash = hash64(data, size);
965
966 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
967 if (r < 0)
968 return r;
969 else if (r > 0) {
970
971 if (ret)
972 *ret = o;
973
974 if (offset)
975 *offset = p;
976
977 return 0;
978 }
979
980 osize = offsetof(Object, data.payload) + size;
981 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
982 if (r < 0)
983 return r;
984
cec736d2 985 o->data.hash = htole64(hash);
807e17f0
LP
986
987#ifdef HAVE_XZ
988 if (f->compress &&
989 size >= COMPRESSION_SIZE_THRESHOLD) {
990 uint64_t rsize;
991
992 compressed = compress_blob(data, size, o->data.payload, &rsize);
993
994 if (compressed) {
995 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
996 o->object.flags |= OBJECT_COMPRESSED;
997
507f22bd 998 log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
807e17f0
LP
999 }
1000 }
1001#endif
1002
64825d3c 1003 if (!compressed && size > 0)
807e17f0 1004 memcpy(o->data.payload, data, size);
cec736d2 1005
de190aef 1006 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1007 if (r < 0)
1008 return r;
1009
48496df6
LP
1010 /* The linking might have altered the window, so let's
1011 * refresh our pointer */
1012 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1013 if (r < 0)
1014 return r;
1015
3c1668da
LP
1016 eq = memchr(data, '=', size);
1017 if (eq && eq > data) {
1018 uint64_t fp;
1019 Object *fo;
1020
1021 /* Create field object ... */
1022 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1023 if (r < 0)
1024 return r;
1025
1026 /* ... and link it in. */
1027 o->data.next_field_offset = fo->field.head_data_offset;
1028 fo->field.head_data_offset = le64toh(p);
1029 }
1030
5996c7c2
LP
1031#ifdef HAVE_GCRYPT
1032 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1033 if (r < 0)
1034 return r;
1035#endif
1036
cec736d2
LP
1037 if (ret)
1038 *ret = o;
1039
1040 if (offset)
de190aef 1041 *offset = p;
cec736d2
LP
1042
1043 return 0;
1044}
1045
1046uint64_t journal_file_entry_n_items(Object *o) {
1047 assert(o);
b588975f
LP
1048
1049 if (o->object.type != OBJECT_ENTRY)
1050 return 0;
cec736d2
LP
1051
1052 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1053}
1054
0284adc6 1055uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1056 assert(o);
b588975f
LP
1057
1058 if (o->object.type != OBJECT_ENTRY_ARRAY)
1059 return 0;
de190aef
LP
1060
1061 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1062}
1063
fb9a24b6
LP
1064uint64_t journal_file_hash_table_n_items(Object *o) {
1065 assert(o);
b588975f
LP
1066
1067 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1068 o->object.type != OBJECT_FIELD_HASH_TABLE)
1069 return 0;
fb9a24b6
LP
1070
1071 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1072}
1073
de190aef 1074static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1075 le64_t *first,
1076 le64_t *idx,
de190aef 1077 uint64_t p) {
cec736d2 1078 int r;
de190aef
LP
1079 uint64_t n = 0, ap = 0, q, i, a, hidx;
1080 Object *o;
1081
cec736d2 1082 assert(f);
de190aef
LP
1083 assert(first);
1084 assert(idx);
1085 assert(p > 0);
cec736d2 1086
de190aef
LP
1087 a = le64toh(*first);
1088 i = hidx = le64toh(*idx);
1089 while (a > 0) {
1090
1091 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1092 if (r < 0)
1093 return r;
cec736d2 1094
de190aef
LP
1095 n = journal_file_entry_array_n_items(o);
1096 if (i < n) {
1097 o->entry_array.items[i] = htole64(p);
1098 *idx = htole64(hidx + 1);
1099 return 0;
1100 }
cec736d2 1101
de190aef
LP
1102 i -= n;
1103 ap = a;
1104 a = le64toh(o->entry_array.next_entry_array_offset);
1105 }
1106
1107 if (hidx > n)
1108 n = (hidx+1) * 2;
1109 else
1110 n = n * 2;
1111
1112 if (n < 4)
1113 n = 4;
1114
1115 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1116 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1117 &o, &q);
cec736d2
LP
1118 if (r < 0)
1119 return r;
1120
feb12d3e 1121#ifdef HAVE_GCRYPT
5996c7c2 1122 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1123 if (r < 0)
1124 return r;
feb12d3e 1125#endif
b0af6f41 1126
de190aef 1127 o->entry_array.items[i] = htole64(p);
cec736d2 1128
de190aef 1129 if (ap == 0)
7be3aa17 1130 *first = htole64(q);
cec736d2 1131 else {
de190aef 1132 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1133 if (r < 0)
1134 return r;
1135
de190aef
LP
1136 o->entry_array.next_entry_array_offset = htole64(q);
1137 }
cec736d2 1138
2dee23eb
LP
1139 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1140 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1141
de190aef
LP
1142 *idx = htole64(hidx + 1);
1143
1144 return 0;
1145}
cec736d2 1146
de190aef 1147static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1148 le64_t *extra,
1149 le64_t *first,
1150 le64_t *idx,
de190aef
LP
1151 uint64_t p) {
1152
1153 int r;
1154
1155 assert(f);
1156 assert(extra);
1157 assert(first);
1158 assert(idx);
1159 assert(p > 0);
1160
1161 if (*idx == 0)
1162 *extra = htole64(p);
1163 else {
4fd052ae 1164 le64_t i;
de190aef 1165
7be3aa17 1166 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1167 r = link_entry_into_array(f, first, &i, p);
1168 if (r < 0)
1169 return r;
cec736d2
LP
1170 }
1171
de190aef
LP
1172 *idx = htole64(le64toh(*idx) + 1);
1173 return 0;
1174}
1175
1176static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1177 uint64_t p;
1178 int r;
1179 assert(f);
1180 assert(o);
1181 assert(offset > 0);
1182
1183 p = le64toh(o->entry.items[i].object_offset);
1184 if (p == 0)
1185 return -EINVAL;
1186
1187 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1188 if (r < 0)
1189 return r;
1190
de190aef
LP
1191 return link_entry_into_array_plus_one(f,
1192 &o->data.entry_offset,
1193 &o->data.entry_array_offset,
1194 &o->data.n_entries,
1195 offset);
cec736d2
LP
1196}
1197
1198static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1199 uint64_t n, i;
cec736d2
LP
1200 int r;
1201
1202 assert(f);
1203 assert(o);
1204 assert(offset > 0);
b588975f
LP
1205
1206 if (o->object.type != OBJECT_ENTRY)
1207 return -EINVAL;
cec736d2 1208
b788cc23
LP
1209 __sync_synchronize();
1210
cec736d2 1211 /* Link up the entry itself */
de190aef
LP
1212 r = link_entry_into_array(f,
1213 &f->header->entry_array_offset,
1214 &f->header->n_entries,
1215 offset);
1216 if (r < 0)
1217 return r;
cec736d2 1218
507f22bd 1219 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1220
de190aef 1221 if (f->header->head_entry_realtime == 0)
0ac38b70 1222 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1223
0ac38b70 1224 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1225 f->header->tail_entry_monotonic = o->entry.monotonic;
1226
1227 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1228
1229 /* Link up the items */
1230 n = journal_file_entry_n_items(o);
1231 for (i = 0; i < n; i++) {
1232 r = journal_file_link_entry_item(f, o, offset, i);
1233 if (r < 0)
1234 return r;
1235 }
1236
cec736d2
LP
1237 return 0;
1238}
1239
1240static int journal_file_append_entry_internal(
1241 JournalFile *f,
1242 const dual_timestamp *ts,
1243 uint64_t xor_hash,
1244 const EntryItem items[], unsigned n_items,
de190aef 1245 uint64_t *seqnum,
cec736d2
LP
1246 Object **ret, uint64_t *offset) {
1247 uint64_t np;
1248 uint64_t osize;
1249 Object *o;
1250 int r;
1251
1252 assert(f);
1253 assert(items || n_items == 0);
de190aef 1254 assert(ts);
cec736d2
LP
1255
1256 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1257
de190aef 1258 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1259 if (r < 0)
1260 return r;
1261
d98cc1f2 1262 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1263 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1264 o->entry.realtime = htole64(ts->realtime);
1265 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1266 o->entry.xor_hash = htole64(xor_hash);
1267 o->entry.boot_id = f->header->boot_id;
1268
feb12d3e 1269#ifdef HAVE_GCRYPT
5996c7c2 1270 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1271 if (r < 0)
1272 return r;
feb12d3e 1273#endif
b0af6f41 1274
cec736d2
LP
1275 r = journal_file_link_entry(f, o, np);
1276 if (r < 0)
1277 return r;
1278
1279 if (ret)
1280 *ret = o;
1281
1282 if (offset)
1283 *offset = np;
1284
1285 return 0;
1286}
1287
cf244689 1288void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1289 assert(f);
1290
1291 /* inotify() does not receive IN_MODIFY events from file
1292 * accesses done via mmap(). After each access we hence
1293 * trigger IN_MODIFY by truncating the journal file to its
1294 * current size which triggers IN_MODIFY. */
1295
bc85bfee
LP
1296 __sync_synchronize();
1297
50f20cfd 1298 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
c5315881 1299 log_error("Failed to truncate file to its own size: %m");
50f20cfd
LP
1300}
1301
1f2da9ec
LP
1302static int entry_item_cmp(const void *_a, const void *_b) {
1303 const EntryItem *a = _a, *b = _b;
1304
1305 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1306 return -1;
1307 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1308 return 1;
1309 return 0;
1310}
1311
de190aef 1312int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1313 unsigned i;
1314 EntryItem *items;
1315 int r;
1316 uint64_t xor_hash = 0;
de190aef 1317 struct dual_timestamp _ts;
cec736d2
LP
1318
1319 assert(f);
1320 assert(iovec || n_iovec == 0);
1321
de190aef
LP
1322 if (!ts) {
1323 dual_timestamp_get(&_ts);
1324 ts = &_ts;
1325 }
1326
1327 if (f->tail_entry_monotonic_valid &&
1328 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1329 return -EINVAL;
1330
feb12d3e 1331#ifdef HAVE_GCRYPT
7560fffc
LP
1332 r = journal_file_maybe_append_tag(f, ts->realtime);
1333 if (r < 0)
1334 return r;
feb12d3e 1335#endif
7560fffc 1336
64825d3c 1337 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1338 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1339
1340 for (i = 0; i < n_iovec; i++) {
1341 uint64_t p;
1342 Object *o;
1343
1344 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1345 if (r < 0)
cf244689 1346 return r;
cec736d2
LP
1347
1348 xor_hash ^= le64toh(o->data.hash);
1349 items[i].object_offset = htole64(p);
de7b95cd 1350 items[i].hash = o->data.hash;
cec736d2
LP
1351 }
1352
1f2da9ec
LP
1353 /* Order by the position on disk, in order to improve seek
1354 * times for rotating media. */
7ff7394d 1355 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1356
de190aef 1357 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1358
50f20cfd
LP
1359 journal_file_post_change(f);
1360
cec736d2
LP
1361 return r;
1362}
1363
a4bcff5b
LP
/* One cached position inside a chain of entry arrays; stored in a hashmap
 * keyed by the 'first' field. */
typedef struct ChainCacheItem {
        /* offset of the array at the beginning of the chain */
        uint64_t first;
        /* offset of the cached array */
        uint64_t array;
        /* the first item stored in the cached array */
        uint64_t begin;
        /* number of items in all arrays of the chain before the cached one */
        uint64_t total;
} ChainCacheItem;
1370
1371static void chain_cache_put(
1372 Hashmap *h,
1373 ChainCacheItem *ci,
1374 uint64_t first,
1375 uint64_t array,
1376 uint64_t begin,
1377 uint64_t total) {
1378
1379 if (!ci) {
34741aa3
LP
1380 /* If the chain item to cache for this chain is the
1381 * first one it's not worth caching anything */
1382 if (array == first)
1383 return;
1384
a4bcff5b
LP
1385 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1386 ci = hashmap_steal_first(h);
1387 else {
1388 ci = new(ChainCacheItem, 1);
1389 if (!ci)
1390 return;
1391 }
1392
1393 ci->first = first;
1394
1395 if (hashmap_put(h, &ci->first, ci) < 0) {
1396 free(ci);
1397 return;
1398 }
1399 } else
1400 assert(ci->first == first);
1401
1402 ci->array = array;
1403 ci->begin = begin;
1404 ci->total = total;
1405}
1406
de190aef
LP
1407static int generic_array_get(JournalFile *f,
1408 uint64_t first,
1409 uint64_t i,
1410 Object **ret, uint64_t *offset) {
1411
cec736d2 1412 Object *o;
a4bcff5b 1413 uint64_t p = 0, a, t = 0;
cec736d2 1414 int r;
a4bcff5b 1415 ChainCacheItem *ci;
cec736d2
LP
1416
1417 assert(f);
1418
de190aef 1419 a = first;
a4bcff5b
LP
1420
1421 /* Try the chain cache first */
1422 ci = hashmap_get(f->chain_cache, &first);
1423 if (ci && i > ci->total) {
1424 a = ci->array;
1425 i -= ci->total;
1426 t = ci->total;
1427 }
1428
de190aef 1429 while (a > 0) {
a4bcff5b 1430 uint64_t k;
cec736d2 1431
de190aef
LP
1432 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1433 if (r < 0)
1434 return r;
cec736d2 1435
a4bcff5b
LP
1436 k = journal_file_entry_array_n_items(o);
1437 if (i < k) {
de190aef 1438 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1439 goto found;
cec736d2
LP
1440 }
1441
a4bcff5b
LP
1442 i -= k;
1443 t += k;
de190aef
LP
1444 a = le64toh(o->entry_array.next_entry_array_offset);
1445 }
1446
a4bcff5b
LP
1447 return 0;
1448
1449found:
1450 /* Let's cache this item for the next invocation */
1451 chain_cache_put(f->chain_cache, ci, first, a, o->entry_array.items[0], t);
de190aef
LP
1452
1453 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1454 if (r < 0)
1455 return r;
1456
1457 if (ret)
1458 *ret = o;
1459
1460 if (offset)
1461 *offset = p;
1462
1463 return 1;
1464}
1465
1466static int generic_array_get_plus_one(JournalFile *f,
1467 uint64_t extra,
1468 uint64_t first,
1469 uint64_t i,
1470 Object **ret, uint64_t *offset) {
1471
1472 Object *o;
1473
1474 assert(f);
1475
1476 if (i == 0) {
1477 int r;
1478
1479 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1480 if (r < 0)
1481 return r;
1482
de190aef
LP
1483 if (ret)
1484 *ret = o;
cec736d2 1485
de190aef
LP
1486 if (offset)
1487 *offset = extra;
cec736d2 1488
de190aef 1489 return 1;
cec736d2
LP
1490 }
1491
de190aef
LP
1492 return generic_array_get(f, first, i-1, ret, offset);
1493}
cec736d2 1494
de190aef
LP
/* Results of the test_object() callbacks used for bisection: tells where the
 * tested object lies relative to the needle. */
enum {
        TEST_FOUND = 0,  /* the object matches the needle */
        TEST_LEFT  = 1,  /* the object sorts before the needle */
        TEST_RIGHT = 2   /* the object sorts after the needle */
};
cec736d2 1500
de190aef
LP
1501static int generic_array_bisect(JournalFile *f,
1502 uint64_t first,
1503 uint64_t n,
1504 uint64_t needle,
1505 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1506 direction_t direction,
1507 Object **ret,
1508 uint64_t *offset,
1509 uint64_t *idx) {
1510
1511 uint64_t a, p, t = 0, i = 0, last_p = 0;
1512 bool subtract_one = false;
1513 Object *o, *array = NULL;
1514 int r;
a4bcff5b 1515 ChainCacheItem *ci;
cec736d2 1516
de190aef
LP
1517 assert(f);
1518 assert(test_object);
cec736d2 1519
a4bcff5b 1520 /* Start with the first array in the chain */
de190aef 1521 a = first;
a4bcff5b
LP
1522
1523 ci = hashmap_get(f->chain_cache, &first);
1524 if (ci && n > ci->total) {
1525 /* Ah, we have iterated this bisection array chain
1526 * previously! Let's see if we can skip ahead in the
1527 * chain, as far as the last time. But we can't jump
1528 * backwards in the chain, so let's check that
1529 * first. */
1530
1531 r = test_object(f, ci->begin, needle);
1532 if (r < 0)
1533 return r;
1534
1535 if (r == TEST_LEFT) {
1536 /* OK, what we are looking for is right of th
1537 * begin of this EntryArray, so let's jump
1538 * straight to previously cached array in the
1539 * chain */
1540
1541 a = ci->array;
1542 n -= ci->total;
1543 t = ci->total;
1544 }
1545 }
1546
de190aef
LP
1547 while (a > 0) {
1548 uint64_t left, right, k, lp;
1549
1550 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1551 if (r < 0)
1552 return r;
1553
de190aef
LP
1554 k = journal_file_entry_array_n_items(array);
1555 right = MIN(k, n);
1556 if (right <= 0)
1557 return 0;
cec736d2 1558
de190aef
LP
1559 i = right - 1;
1560 lp = p = le64toh(array->entry_array.items[i]);
1561 if (p <= 0)
1562 return -EBADMSG;
cec736d2 1563
de190aef
LP
1564 r = test_object(f, p, needle);
1565 if (r < 0)
1566 return r;
cec736d2 1567
de190aef
LP
1568 if (r == TEST_FOUND)
1569 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1570
1571 if (r == TEST_RIGHT) {
1572 left = 0;
1573 right -= 1;
1574 for (;;) {
1575 if (left == right) {
1576 if (direction == DIRECTION_UP)
1577 subtract_one = true;
1578
1579 i = left;
1580 goto found;
1581 }
1582
1583 assert(left < right);
1584
1585 i = (left + right) / 2;
1586 p = le64toh(array->entry_array.items[i]);
1587 if (p <= 0)
1588 return -EBADMSG;
1589
1590 r = test_object(f, p, needle);
1591 if (r < 0)
1592 return r;
cec736d2 1593
de190aef
LP
1594 if (r == TEST_FOUND)
1595 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1596
1597 if (r == TEST_RIGHT)
1598 right = i;
1599 else
1600 left = i + 1;
1601 }
1602 }
1603
cbdca852
LP
1604 if (k > n) {
1605 if (direction == DIRECTION_UP) {
1606 i = n;
1607 subtract_one = true;
1608 goto found;
1609 }
1610
cec736d2 1611 return 0;
cbdca852 1612 }
cec736d2 1613
de190aef
LP
1614 last_p = lp;
1615
1616 n -= k;
1617 t += k;
1618 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1619 }
1620
1621 return 0;
de190aef
LP
1622
1623found:
1624 if (subtract_one && t == 0 && i == 0)
1625 return 0;
1626
a4bcff5b
LP
1627 /* Let's cache this item for the next invocation */
1628 chain_cache_put(f->chain_cache, ci, first, a, array->entry_array.items[0], t);
1629
de190aef
LP
1630 if (subtract_one && i == 0)
1631 p = last_p;
1632 else if (subtract_one)
1633 p = le64toh(array->entry_array.items[i-1]);
1634 else
1635 p = le64toh(array->entry_array.items[i]);
1636
1637 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1638 if (r < 0)
1639 return r;
1640
1641 if (ret)
1642 *ret = o;
1643
1644 if (offset)
1645 *offset = p;
1646
1647 if (idx)
cbdca852 1648 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1649
1650 return 1;
cec736d2
LP
1651}
1652
de190aef
LP
1653static int generic_array_bisect_plus_one(JournalFile *f,
1654 uint64_t extra,
1655 uint64_t first,
1656 uint64_t n,
1657 uint64_t needle,
1658 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1659 direction_t direction,
1660 Object **ret,
1661 uint64_t *offset,
1662 uint64_t *idx) {
1663
cec736d2 1664 int r;
cbdca852
LP
1665 bool step_back = false;
1666 Object *o;
cec736d2
LP
1667
1668 assert(f);
de190aef 1669 assert(test_object);
cec736d2 1670
de190aef
LP
1671 if (n <= 0)
1672 return 0;
cec736d2 1673
de190aef
LP
1674 /* This bisects the array in object 'first', but first checks
1675 * an extra */
de190aef
LP
1676 r = test_object(f, extra, needle);
1677 if (r < 0)
1678 return r;
a536e261
LP
1679
1680 if (r == TEST_FOUND)
1681 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1682
cbdca852
LP
1683 /* if we are looking with DIRECTION_UP then we need to first
1684 see if in the actual array there is a matching entry, and
1685 return the last one of that. But if there isn't any we need
1686 to return this one. Hence remember this, and return it
1687 below. */
1688 if (r == TEST_LEFT)
1689 step_back = direction == DIRECTION_UP;
de190aef 1690
cbdca852
LP
1691 if (r == TEST_RIGHT) {
1692 if (direction == DIRECTION_DOWN)
1693 goto found;
1694 else
1695 return 0;
a536e261 1696 }
cec736d2 1697
de190aef
LP
1698 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1699
cbdca852
LP
1700 if (r == 0 && step_back)
1701 goto found;
1702
ecf68b1d 1703 if (r > 0 && idx)
de190aef
LP
1704 (*idx) ++;
1705
1706 return r;
cbdca852
LP
1707
1708found:
1709 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1710 if (r < 0)
1711 return r;
1712
1713 if (ret)
1714 *ret = o;
1715
1716 if (offset)
1717 *offset = extra;
1718
1719 if (idx)
1720 *idx = 0;
1721
1722 return 1;
1723}
1724
44a6b1b6 1725_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1726 assert(f);
1727 assert(p > 0);
1728
1729 if (p == needle)
1730 return TEST_FOUND;
1731 else if (p < needle)
1732 return TEST_LEFT;
1733 else
1734 return TEST_RIGHT;
1735}
1736
1737int journal_file_move_to_entry_by_offset(
1738 JournalFile *f,
1739 uint64_t p,
1740 direction_t direction,
1741 Object **ret,
1742 uint64_t *offset) {
1743
1744 return generic_array_bisect(f,
1745 le64toh(f->header->entry_array_offset),
1746 le64toh(f->header->n_entries),
1747 p,
1748 test_object_offset,
1749 direction,
1750 ret, offset, NULL);
de190aef
LP
1751}
1752
cbdca852 1753
de190aef
LP
1754static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1755 Object *o;
1756 int r;
1757
1758 assert(f);
1759 assert(p > 0);
1760
1761 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1762 if (r < 0)
1763 return r;
1764
de190aef
LP
1765 if (le64toh(o->entry.seqnum) == needle)
1766 return TEST_FOUND;
1767 else if (le64toh(o->entry.seqnum) < needle)
1768 return TEST_LEFT;
1769 else
1770 return TEST_RIGHT;
1771}
cec736d2 1772
de190aef
LP
1773int journal_file_move_to_entry_by_seqnum(
1774 JournalFile *f,
1775 uint64_t seqnum,
1776 direction_t direction,
1777 Object **ret,
1778 uint64_t *offset) {
1779
1780 return generic_array_bisect(f,
1781 le64toh(f->header->entry_array_offset),
1782 le64toh(f->header->n_entries),
1783 seqnum,
1784 test_object_seqnum,
1785 direction,
1786 ret, offset, NULL);
1787}
cec736d2 1788
de190aef
LP
1789static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1790 Object *o;
1791 int r;
1792
1793 assert(f);
1794 assert(p > 0);
1795
1796 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1797 if (r < 0)
1798 return r;
1799
1800 if (le64toh(o->entry.realtime) == needle)
1801 return TEST_FOUND;
1802 else if (le64toh(o->entry.realtime) < needle)
1803 return TEST_LEFT;
1804 else
1805 return TEST_RIGHT;
cec736d2
LP
1806}
1807
de190aef
LP
1808int journal_file_move_to_entry_by_realtime(
1809 JournalFile *f,
1810 uint64_t realtime,
1811 direction_t direction,
1812 Object **ret,
1813 uint64_t *offset) {
1814
1815 return generic_array_bisect(f,
1816 le64toh(f->header->entry_array_offset),
1817 le64toh(f->header->n_entries),
1818 realtime,
1819 test_object_realtime,
1820 direction,
1821 ret, offset, NULL);
1822}
1823
1824static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1825 Object *o;
1826 int r;
1827
1828 assert(f);
1829 assert(p > 0);
1830
1831 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1832 if (r < 0)
1833 return r;
1834
1835 if (le64toh(o->entry.monotonic) == needle)
1836 return TEST_FOUND;
1837 else if (le64toh(o->entry.monotonic) < needle)
1838 return TEST_LEFT;
1839 else
1840 return TEST_RIGHT;
1841}
1842
47838ab3
ZJS
1843static inline int find_data_object_by_boot_id(
1844 JournalFile *f,
1845 sd_id128_t boot_id,
1846 Object **o,
1847 uint64_t *b) {
1848 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1849
1850 sd_id128_to_string(boot_id, t + 9);
1851 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1852}
1853
de190aef
LP
1854int journal_file_move_to_entry_by_monotonic(
1855 JournalFile *f,
1856 sd_id128_t boot_id,
1857 uint64_t monotonic,
1858 direction_t direction,
1859 Object **ret,
1860 uint64_t *offset) {
1861
de190aef
LP
1862 Object *o;
1863 int r;
1864
cbdca852 1865 assert(f);
de190aef 1866
47838ab3 1867 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
1868 if (r < 0)
1869 return r;
cbdca852 1870 if (r == 0)
de190aef
LP
1871 return -ENOENT;
1872
1873 return generic_array_bisect_plus_one(f,
1874 le64toh(o->data.entry_offset),
1875 le64toh(o->data.entry_array_offset),
1876 le64toh(o->data.n_entries),
1877 monotonic,
1878 test_object_monotonic,
1879 direction,
1880 ret, offset, NULL);
1881}
1882
de190aef
LP
1883int journal_file_next_entry(
1884 JournalFile *f,
1885 Object *o, uint64_t p,
1886 direction_t direction,
1887 Object **ret, uint64_t *offset) {
1888
1889 uint64_t i, n;
cec736d2
LP
1890 int r;
1891
1892 assert(f);
de190aef
LP
1893 assert(p > 0 || !o);
1894
1895 n = le64toh(f->header->n_entries);
1896 if (n <= 0)
1897 return 0;
cec736d2
LP
1898
1899 if (!o)
de190aef 1900 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 1901 else {
de190aef 1902 if (o->object.type != OBJECT_ENTRY)
cec736d2
LP
1903 return -EINVAL;
1904
de190aef
LP
1905 r = generic_array_bisect(f,
1906 le64toh(f->header->entry_array_offset),
1907 le64toh(f->header->n_entries),
1908 p,
1909 test_object_offset,
1910 DIRECTION_DOWN,
1911 NULL, NULL,
1912 &i);
1913 if (r <= 0)
1914 return r;
1915
1916 if (direction == DIRECTION_DOWN) {
1917 if (i >= n - 1)
1918 return 0;
1919
1920 i++;
1921 } else {
1922 if (i <= 0)
1923 return 0;
1924
1925 i--;
1926 }
cec736d2
LP
1927 }
1928
de190aef
LP
1929 /* And jump to it */
1930 return generic_array_get(f,
1931 le64toh(f->header->entry_array_offset),
1932 i,
1933 ret, offset);
1934}
cec736d2 1935
de190aef
LP
1936int journal_file_skip_entry(
1937 JournalFile *f,
1938 Object *o, uint64_t p,
1939 int64_t skip,
1940 Object **ret, uint64_t *offset) {
1941
1942 uint64_t i, n;
1943 int r;
1944
1945 assert(f);
1946 assert(o);
1947 assert(p > 0);
1948
1949 if (o->object.type != OBJECT_ENTRY)
1950 return -EINVAL;
1951
1952 r = generic_array_bisect(f,
1953 le64toh(f->header->entry_array_offset),
1954 le64toh(f->header->n_entries),
1955 p,
1956 test_object_offset,
1957 DIRECTION_DOWN,
1958 NULL, NULL,
1959 &i);
1960 if (r <= 0)
cec736d2
LP
1961 return r;
1962
de190aef
LP
1963 /* Calculate new index */
1964 if (skip < 0) {
1965 if ((uint64_t) -skip >= i)
1966 i = 0;
1967 else
1968 i = i - (uint64_t) -skip;
1969 } else
1970 i += (uint64_t) skip;
cec736d2 1971
de190aef
LP
1972 n = le64toh(f->header->n_entries);
1973 if (n <= 0)
1974 return -EBADMSG;
cec736d2 1975
de190aef
LP
1976 if (i >= n)
1977 i = n-1;
1978
1979 return generic_array_get(f,
1980 le64toh(f->header->entry_array_offset),
1981 i,
1982 ret, offset);
cec736d2
LP
1983}
1984
de190aef
LP
1985int journal_file_next_entry_for_data(
1986 JournalFile *f,
1987 Object *o, uint64_t p,
1988 uint64_t data_offset,
1989 direction_t direction,
1990 Object **ret, uint64_t *offset) {
1991
1992 uint64_t n, i;
cec736d2 1993 int r;
de190aef 1994 Object *d;
cec736d2
LP
1995
1996 assert(f);
de190aef 1997 assert(p > 0 || !o);
cec736d2 1998
de190aef 1999 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2000 if (r < 0)
de190aef 2001 return r;
cec736d2 2002
de190aef
LP
2003 n = le64toh(d->data.n_entries);
2004 if (n <= 0)
2005 return n;
cec736d2 2006
de190aef
LP
2007 if (!o)
2008 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2009 else {
2010 if (o->object.type != OBJECT_ENTRY)
2011 return -EINVAL;
cec736d2 2012
de190aef
LP
2013 r = generic_array_bisect_plus_one(f,
2014 le64toh(d->data.entry_offset),
2015 le64toh(d->data.entry_array_offset),
2016 le64toh(d->data.n_entries),
2017 p,
2018 test_object_offset,
2019 DIRECTION_DOWN,
2020 NULL, NULL,
2021 &i);
2022
2023 if (r <= 0)
cec736d2
LP
2024 return r;
2025
de190aef
LP
2026 if (direction == DIRECTION_DOWN) {
2027 if (i >= n - 1)
2028 return 0;
cec736d2 2029
de190aef
LP
2030 i++;
2031 } else {
2032 if (i <= 0)
2033 return 0;
cec736d2 2034
de190aef
LP
2035 i--;
2036 }
cec736d2 2037
de190aef 2038 }
cec736d2 2039
de190aef
LP
2040 return generic_array_get_plus_one(f,
2041 le64toh(d->data.entry_offset),
2042 le64toh(d->data.entry_array_offset),
2043 i,
2044 ret, offset);
2045}
cec736d2 2046
cbdca852
LP
2047int journal_file_move_to_entry_by_offset_for_data(
2048 JournalFile *f,
2049 uint64_t data_offset,
2050 uint64_t p,
2051 direction_t direction,
2052 Object **ret, uint64_t *offset) {
2053
2054 int r;
2055 Object *d;
2056
2057 assert(f);
2058
2059 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2060 if (r < 0)
2061 return r;
2062
2063 return generic_array_bisect_plus_one(f,
2064 le64toh(d->data.entry_offset),
2065 le64toh(d->data.entry_array_offset),
2066 le64toh(d->data.n_entries),
2067 p,
2068 test_object_offset,
2069 direction,
2070 ret, offset, NULL);
2071}
2072
2073int journal_file_move_to_entry_by_monotonic_for_data(
2074 JournalFile *f,
2075 uint64_t data_offset,
2076 sd_id128_t boot_id,
2077 uint64_t monotonic,
2078 direction_t direction,
2079 Object **ret, uint64_t *offset) {
2080
cbdca852
LP
2081 Object *o, *d;
2082 int r;
2083 uint64_t b, z;
2084
2085 assert(f);
2086
2087 /* First, seek by time */
47838ab3 2088 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2089 if (r < 0)
2090 return r;
2091 if (r == 0)
2092 return -ENOENT;
2093
2094 r = generic_array_bisect_plus_one(f,
2095 le64toh(o->data.entry_offset),
2096 le64toh(o->data.entry_array_offset),
2097 le64toh(o->data.n_entries),
2098 monotonic,
2099 test_object_monotonic,
2100 direction,
2101 NULL, &z, NULL);
2102 if (r <= 0)
2103 return r;
2104
2105 /* And now, continue seeking until we find an entry that
2106 * exists in both bisection arrays */
2107
2108 for (;;) {
2109 Object *qo;
2110 uint64_t p, q;
2111
2112 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2113 if (r < 0)
2114 return r;
2115
2116 r = generic_array_bisect_plus_one(f,
2117 le64toh(d->data.entry_offset),
2118 le64toh(d->data.entry_array_offset),
2119 le64toh(d->data.n_entries),
2120 z,
2121 test_object_offset,
2122 direction,
2123 NULL, &p, NULL);
2124 if (r <= 0)
2125 return r;
2126
2127 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2128 if (r < 0)
2129 return r;
2130
2131 r = generic_array_bisect_plus_one(f,
2132 le64toh(o->data.entry_offset),
2133 le64toh(o->data.entry_array_offset),
2134 le64toh(o->data.n_entries),
2135 p,
2136 test_object_offset,
2137 direction,
2138 &qo, &q, NULL);
2139
2140 if (r <= 0)
2141 return r;
2142
2143 if (p == q) {
2144 if (ret)
2145 *ret = qo;
2146 if (offset)
2147 *offset = q;
2148
2149 return 1;
2150 }
2151
2152 z = q;
2153 }
2154
2155 return 0;
2156}
2157
de190aef
LP
2158int journal_file_move_to_entry_by_seqnum_for_data(
2159 JournalFile *f,
2160 uint64_t data_offset,
2161 uint64_t seqnum,
2162 direction_t direction,
2163 Object **ret, uint64_t *offset) {
cec736d2 2164
de190aef
LP
2165 Object *d;
2166 int r;
cec736d2 2167
91a31dde
LP
2168 assert(f);
2169
de190aef 2170 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2171 if (r < 0)
de190aef 2172 return r;
cec736d2 2173
de190aef
LP
2174 return generic_array_bisect_plus_one(f,
2175 le64toh(d->data.entry_offset),
2176 le64toh(d->data.entry_array_offset),
2177 le64toh(d->data.n_entries),
2178 seqnum,
2179 test_object_seqnum,
2180 direction,
2181 ret, offset, NULL);
2182}
cec736d2 2183
de190aef
LP
2184int journal_file_move_to_entry_by_realtime_for_data(
2185 JournalFile *f,
2186 uint64_t data_offset,
2187 uint64_t realtime,
2188 direction_t direction,
2189 Object **ret, uint64_t *offset) {
2190
2191 Object *d;
2192 int r;
2193
91a31dde
LP
2194 assert(f);
2195
de190aef 2196 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2197 if (r < 0)
de190aef
LP
2198 return r;
2199
2200 return generic_array_bisect_plus_one(f,
2201 le64toh(d->data.entry_offset),
2202 le64toh(d->data.entry_array_offset),
2203 le64toh(d->data.n_entries),
2204 realtime,
2205 test_object_realtime,
2206 direction,
2207 ret, offset, NULL);
cec736d2
LP
2208}
2209
0284adc6 2210void journal_file_dump(JournalFile *f) {
7560fffc 2211 Object *o;
7560fffc 2212 int r;
0284adc6 2213 uint64_t p;
7560fffc
LP
2214
2215 assert(f);
2216
0284adc6 2217 journal_file_print_header(f);
7560fffc 2218
0284adc6
LP
2219 p = le64toh(f->header->header_size);
2220 while (p != 0) {
2221 r = journal_file_move_to_object(f, -1, p, &o);
2222 if (r < 0)
2223 goto fail;
7560fffc 2224
0284adc6 2225 switch (o->object.type) {
d98cc1f2 2226
0284adc6
LP
2227 case OBJECT_UNUSED:
2228 printf("Type: OBJECT_UNUSED\n");
2229 break;
d98cc1f2 2230
0284adc6
LP
2231 case OBJECT_DATA:
2232 printf("Type: OBJECT_DATA\n");
2233 break;
7560fffc 2234
3c1668da
LP
2235 case OBJECT_FIELD:
2236 printf("Type: OBJECT_FIELD\n");
2237 break;
2238
0284adc6 2239 case OBJECT_ENTRY:
507f22bd
ZJS
2240 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2241 le64toh(o->entry.seqnum),
2242 le64toh(o->entry.monotonic),
2243 le64toh(o->entry.realtime));
0284adc6 2244 break;
7560fffc 2245
0284adc6
LP
2246 case OBJECT_FIELD_HASH_TABLE:
2247 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2248 break;
7560fffc 2249
0284adc6
LP
2250 case OBJECT_DATA_HASH_TABLE:
2251 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2252 break;
7560fffc 2253
0284adc6
LP
2254 case OBJECT_ENTRY_ARRAY:
2255 printf("Type: OBJECT_ENTRY_ARRAY\n");
2256 break;
7560fffc 2257
0284adc6 2258 case OBJECT_TAG:
507f22bd
ZJS
2259 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2260 le64toh(o->tag.seqnum),
2261 le64toh(o->tag.epoch));
0284adc6 2262 break;
3c1668da
LP
2263
2264 default:
2265 printf("Type: unknown (%u)\n", o->object.type);
2266 break;
0284adc6 2267 }
7560fffc 2268
0284adc6
LP
2269 if (o->object.flags & OBJECT_COMPRESSED)
2270 printf("Flags: COMPRESSED\n");
7560fffc 2271
0284adc6
LP
2272 if (p == le64toh(f->header->tail_object_offset))
2273 p = 0;
2274 else
2275 p = p + ALIGN64(le64toh(o->object.size));
2276 }
7560fffc 2277
0284adc6
LP
2278 return;
2279fail:
2280 log_error("File corrupt");
7560fffc
LP
2281}
2282
718fe4b1
ZJS
2283static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2284 const char *x;
2285
2286 x = format_timestamp(buf, l, t);
2287 if (x)
2288 return x;
2289 return " --- ";
2290}
2291
0284adc6 2292void journal_file_print_header(JournalFile *f) {
2765b7bb 2293 char a[33], b[33], c[33], d[33];
ed375beb 2294 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2295 struct stat st;
2296 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2297
2298 assert(f);
7560fffc 2299
0284adc6
LP
2300 printf("File Path: %s\n"
2301 "File ID: %s\n"
2302 "Machine ID: %s\n"
2303 "Boot ID: %s\n"
2304 "Sequential Number ID: %s\n"
2305 "State: %s\n"
2306 "Compatible Flags:%s%s\n"
2307 "Incompatible Flags:%s%s\n"
507f22bd
ZJS
2308 "Header size: %"PRIu64"\n"
2309 "Arena size: %"PRIu64"\n"
2310 "Data Hash Table Size: %"PRIu64"\n"
2311 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2312 "Rotate Suggested: %s\n"
507f22bd
ZJS
2313 "Head Sequential Number: %"PRIu64"\n"
2314 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2315 "Head Realtime Timestamp: %s\n"
3223f44f 2316 "Tail Realtime Timestamp: %s\n"
ed375beb 2317 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2318 "Objects: %"PRIu64"\n"
2319 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2320 f->path,
2321 sd_id128_to_string(f->header->file_id, a),
2322 sd_id128_to_string(f->header->machine_id, b),
2323 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2324 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2325 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2326 f->header->state == STATE_ONLINE ? "ONLINE" :
2327 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3
LP
2328 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2329 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2330 JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2331 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
507f22bd
ZJS
2332 le64toh(f->header->header_size),
2333 le64toh(f->header->arena_size),
2334 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2335 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2336 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2337 le64toh(f->header->head_entry_seqnum),
2338 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2339 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2340 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2341 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2342 le64toh(f->header->n_objects),
2343 le64toh(f->header->n_entries));
7560fffc 2344
0284adc6 2345 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2346 printf("Data Objects: %"PRIu64"\n"
0284adc6 2347 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2348 le64toh(f->header->n_data),
0284adc6 2349 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2350
0284adc6 2351 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2352 printf("Field Objects: %"PRIu64"\n"
0284adc6 2353 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2354 le64toh(f->header->n_fields),
0284adc6 2355 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2356
2357 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2358 printf("Tag Objects: %"PRIu64"\n",
2359 le64toh(f->header->n_tags));
3223f44f 2360 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2361 printf("Entry Array Objects: %"PRIu64"\n",
2362 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2363
2364 if (fstat(f->fd, &st) >= 0)
2365 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
7560fffc
LP
2366}
2367
0284adc6
LP
2368int journal_file_open(
2369 const char *fname,
2370 int flags,
2371 mode_t mode,
2372 bool compress,
baed47c3 2373 bool seal,
0284adc6
LP
2374 JournalMetrics *metrics,
2375 MMapCache *mmap_cache,
2376 JournalFile *template,
2377 JournalFile **ret) {
7560fffc 2378
0284adc6
LP
2379 JournalFile *f;
2380 int r;
2381 bool newly_created = false;
7560fffc 2382
0284adc6 2383 assert(fname);
0559d3a5 2384 assert(ret);
7560fffc 2385
0284adc6
LP
2386 if ((flags & O_ACCMODE) != O_RDONLY &&
2387 (flags & O_ACCMODE) != O_RDWR)
2388 return -EINVAL;
7560fffc 2389
a0108012
LP
2390 if (!endswith(fname, ".journal") &&
2391 !endswith(fname, ".journal~"))
0284adc6 2392 return -EINVAL;
7560fffc 2393
0284adc6
LP
2394 f = new0(JournalFile, 1);
2395 if (!f)
2396 return -ENOMEM;
7560fffc 2397
0284adc6
LP
2398 f->fd = -1;
2399 f->mode = mode;
7560fffc 2400
0284adc6
LP
2401 f->flags = flags;
2402 f->prot = prot_from_flags(flags);
2403 f->writable = (flags & O_ACCMODE) != O_RDONLY;
48b61739 2404#ifdef HAVE_XZ
0284adc6 2405 f->compress = compress;
48b61739 2406#endif
49a32d43 2407#ifdef HAVE_GCRYPT
baed47c3 2408 f->seal = seal;
49a32d43 2409#endif
7560fffc 2410
0284adc6
LP
2411 if (mmap_cache)
2412 f->mmap = mmap_cache_ref(mmap_cache);
2413 else {
84168d80 2414 f->mmap = mmap_cache_new();
0284adc6
LP
2415 if (!f->mmap) {
2416 r = -ENOMEM;
2417 goto fail;
2418 }
2419 }
7560fffc 2420
0284adc6
LP
2421 f->path = strdup(fname);
2422 if (!f->path) {
2423 r = -ENOMEM;
2424 goto fail;
2425 }
7560fffc 2426
a4bcff5b
LP
2427 f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2428 if (!f->chain_cache) {
2429 r = -ENOMEM;
2430 goto fail;
2431 }
2432
0284adc6
LP
2433 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2434 if (f->fd < 0) {
2435 r = -errno;
2436 goto fail;
7560fffc 2437 }
7560fffc 2438
0284adc6
LP
2439 if (fstat(f->fd, &f->last_stat) < 0) {
2440 r = -errno;
2441 goto fail;
2442 }
7560fffc 2443
0284adc6 2444 if (f->last_stat.st_size == 0 && f->writable) {
fb0951b0
LP
2445#ifdef HAVE_XATTR
2446 uint64_t crtime;
2447
2448 /* Let's attach the creation time to the journal file,
2449 * so that the vacuuming code knows the age of this
2450 * file even if the file might end up corrupted one
2451 * day... Ideally we'd just use the creation time many
2452 * file systems maintain for each file, but there is
2453 * currently no usable API to query this, hence let's
2454 * emulate this via extended attributes. If extended
2455 * attributes are not supported we'll just skip this,
2456 * and rely solely on mtime/atime/ctime of the file.*/
2457
2458 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2459 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2460#endif
7560fffc 2461
feb12d3e 2462#ifdef HAVE_GCRYPT
0284adc6 2463 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2464 * just don't do sealing */
49a32d43
LP
2465 if (f->seal) {
2466 r = journal_file_fss_load(f);
2467 if (r < 0)
2468 f->seal = false;
2469 }
feb12d3e 2470#endif
7560fffc 2471
0284adc6
LP
2472 r = journal_file_init_header(f, template);
2473 if (r < 0)
2474 goto fail;
7560fffc 2475
0284adc6
LP
2476 if (fstat(f->fd, &f->last_stat) < 0) {
2477 r = -errno;
2478 goto fail;
2479 }
fb0951b0
LP
2480
2481 newly_created = true;
0284adc6 2482 }
7560fffc 2483
0284adc6
LP
2484 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2485 r = -EIO;
2486 goto fail;
2487 }
7560fffc 2488
0284adc6
LP
2489 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2490 if (f->header == MAP_FAILED) {
2491 f->header = NULL;
2492 r = -errno;
2493 goto fail;
2494 }
7560fffc 2495
0284adc6
LP
2496 if (!newly_created) {
2497 r = journal_file_verify_header(f);
2498 if (r < 0)
2499 goto fail;
2500 }
7560fffc 2501
feb12d3e 2502#ifdef HAVE_GCRYPT
0284adc6 2503 if (!newly_created && f->writable) {
baed47c3 2504 r = journal_file_fss_load(f);
0284adc6
LP
2505 if (r < 0)
2506 goto fail;
2507 }
feb12d3e 2508#endif
cec736d2
LP
2509
2510 if (f->writable) {
4a92baf3
LP
2511 if (metrics) {
2512 journal_default_metrics(metrics, f->fd);
2513 f->metrics = *metrics;
2514 } else if (template)
2515 f->metrics = template->metrics;
2516
cec736d2
LP
2517 r = journal_file_refresh_header(f);
2518 if (r < 0)
2519 goto fail;
2520 }
2521
feb12d3e 2522#ifdef HAVE_GCRYPT
baed47c3 2523 r = journal_file_hmac_setup(f);
14d10188
LP
2524 if (r < 0)
2525 goto fail;
feb12d3e 2526#endif
14d10188 2527
cec736d2 2528 if (newly_created) {
de190aef 2529 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2530 if (r < 0)
2531 goto fail;
2532
de190aef 2533 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2534 if (r < 0)
2535 goto fail;
7560fffc 2536
feb12d3e 2537#ifdef HAVE_GCRYPT
7560fffc
LP
2538 r = journal_file_append_first_tag(f);
2539 if (r < 0)
2540 goto fail;
feb12d3e 2541#endif
cec736d2
LP
2542 }
2543
de190aef 2544 r = journal_file_map_field_hash_table(f);
cec736d2
LP
2545 if (r < 0)
2546 goto fail;
2547
de190aef 2548 r = journal_file_map_data_hash_table(f);
cec736d2
LP
2549 if (r < 0)
2550 goto fail;
2551
0559d3a5 2552 *ret = f;
cec736d2
LP
2553 return 0;
2554
2555fail:
2556 journal_file_close(f);
2557
2558 return r;
2559}
0ac38b70 2560
baed47c3 2561int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2562 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2563 size_t l;
2564 JournalFile *old_file, *new_file = NULL;
2565 int r;
2566
2567 assert(f);
2568 assert(*f);
2569
2570 old_file = *f;
2571
2572 if (!old_file->writable)
2573 return -EINVAL;
2574
2575 if (!endswith(old_file->path, ".journal"))
2576 return -EINVAL;
2577
2578 l = strlen(old_file->path);
57535f47
ZJS
2579 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2580 (int) l - 8, old_file->path,
2581 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2582 le64toh((*f)->header->head_entry_seqnum),
2583 le64toh((*f)->header->head_entry_realtime));
2584 if (r < 0)
0ac38b70
LP
2585 return -ENOMEM;
2586
0ac38b70 2587 r = rename(old_file->path, p);
0ac38b70
LP
2588 if (r < 0)
2589 return -errno;
2590
ccdbaf91 2591 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2592
baed47c3 2593 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2594 journal_file_close(old_file);
2595
2596 *f = new_file;
2597 return r;
2598}
2599
9447a7f1
LP
2600int journal_file_open_reliably(
2601 const char *fname,
2602 int flags,
2603 mode_t mode,
7560fffc 2604 bool compress,
baed47c3 2605 bool seal,
4a92baf3 2606 JournalMetrics *metrics,
27370278 2607 MMapCache *mmap_cache,
9447a7f1
LP
2608 JournalFile *template,
2609 JournalFile **ret) {
2610
2611 int r;
2612 size_t l;
ed375beb 2613 _cleanup_free_ char *p = NULL;
9447a7f1 2614
baed47c3 2615 r = journal_file_open(fname, flags, mode, compress, seal,
27370278 2616 metrics, mmap_cache, template, ret);
0071d9f1
LP
2617 if (r != -EBADMSG && /* corrupted */
2618 r != -ENODATA && /* truncated */
2619 r != -EHOSTDOWN && /* other machine */
a1a1898f
LP
2620 r != -EPROTONOSUPPORT && /* incompatible feature */
2621 r != -EBUSY && /* unclean shutdown */
2622 r != -ESHUTDOWN /* already archived */)
9447a7f1
LP
2623 return r;
2624
2625 if ((flags & O_ACCMODE) == O_RDONLY)
2626 return r;
2627
2628 if (!(flags & O_CREAT))
2629 return r;
2630
7560fffc
LP
2631 if (!endswith(fname, ".journal"))
2632 return r;
2633
5c70eab4
LP
2634 /* The file is corrupted. Rotate it away and try it again (but only once) */
2635
9447a7f1
LP
2636 l = strlen(fname);
2637 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
57535f47 2638 (int) l - 8, fname,
9447a7f1
LP
2639 (unsigned long long) now(CLOCK_REALTIME),
2640 random_ull()) < 0)
2641 return -ENOMEM;
2642
2643 r = rename(fname, p);
9447a7f1
LP
2644 if (r < 0)
2645 return -errno;
2646
a1a1898f 2647 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2648
baed47c3 2649 return journal_file_open(fname, flags, mode, compress, seal,
27370278 2650 metrics, mmap_cache, template, ret);
9447a7f1
LP
2651}
2652
cf244689
LP
2653int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2654 uint64_t i, n;
2655 uint64_t q, xor_hash = 0;
2656 int r;
2657 EntryItem *items;
2658 dual_timestamp ts;
2659
2660 assert(from);
2661 assert(to);
2662 assert(o);
2663 assert(p);
2664
2665 if (!to->writable)
2666 return -EPERM;
2667
2668 ts.monotonic = le64toh(o->entry.monotonic);
2669 ts.realtime = le64toh(o->entry.realtime);
2670
2671 if (to->tail_entry_monotonic_valid &&
2672 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2673 return -EINVAL;
2674
cf244689
LP
2675 n = journal_file_entry_n_items(o);
2676 items = alloca(sizeof(EntryItem) * n);
2677
2678 for (i = 0; i < n; i++) {
4fd052ae
FC
2679 uint64_t l, h;
2680 le64_t le_hash;
cf244689
LP
2681 size_t t;
2682 void *data;
2683 Object *u;
2684
2685 q = le64toh(o->entry.items[i].object_offset);
2686 le_hash = o->entry.items[i].hash;
2687
2688 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2689 if (r < 0)
2690 return r;
2691
2692 if (le_hash != o->data.hash)
2693 return -EBADMSG;
2694
2695 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2696 t = (size_t) l;
2697
2698 /* We hit the limit on 32bit machines */
2699 if ((uint64_t) t != l)
2700 return -E2BIG;
2701
2702 if (o->object.flags & OBJECT_COMPRESSED) {
2703#ifdef HAVE_XZ
2704 uint64_t rsize;
2705
93b73b06 2706 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
cf244689
LP
2707 return -EBADMSG;
2708
2709 data = from->compress_buffer;
2710 l = rsize;
2711#else
2712 return -EPROTONOSUPPORT;
2713#endif
2714 } else
2715 data = o->data.payload;
2716
2717 r = journal_file_append_data(to, data, l, &u, &h);
2718 if (r < 0)
2719 return r;
2720
2721 xor_hash ^= le64toh(u->data.hash);
2722 items[i].object_offset = htole64(h);
2723 items[i].hash = u->data.hash;
2724
2725 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2726 if (r < 0)
2727 return r;
2728 }
2729
2730 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2731}
babfc091
LP
2732
2733void journal_default_metrics(JournalMetrics *m, int fd) {
2734 uint64_t fs_size = 0;
2735 struct statvfs ss;
a7bc2c2a 2736 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
babfc091
LP
2737
2738 assert(m);
2739 assert(fd >= 0);
2740
2741 if (fstatvfs(fd, &ss) >= 0)
2742 fs_size = ss.f_frsize * ss.f_blocks;
2743
2744 if (m->max_use == (uint64_t) -1) {
2745
2746 if (fs_size > 0) {
2747 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2748
2749 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2750 m->max_use = DEFAULT_MAX_USE_UPPER;
2751
2752 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2753 m->max_use = DEFAULT_MAX_USE_LOWER;
2754 } else
2755 m->max_use = DEFAULT_MAX_USE_LOWER;
2756 } else {
2757 m->max_use = PAGE_ALIGN(m->max_use);
2758
2759 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2760 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2761 }
2762
2763 if (m->max_size == (uint64_t) -1) {
2764 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2765
2766 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2767 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2768 } else
2769 m->max_size = PAGE_ALIGN(m->max_size);
2770
2771 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2772 m->max_size = JOURNAL_FILE_SIZE_MIN;
2773
2774 if (m->max_size*2 > m->max_use)
2775 m->max_use = m->max_size*2;
2776
2777 if (m->min_size == (uint64_t) -1)
2778 m->min_size = JOURNAL_FILE_SIZE_MIN;
2779 else {
2780 m->min_size = PAGE_ALIGN(m->min_size);
2781
2782 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2783 m->min_size = JOURNAL_FILE_SIZE_MIN;
2784
2785 if (m->min_size > m->max_size)
2786 m->max_size = m->min_size;
2787 }
2788
2789 if (m->keep_free == (uint64_t) -1) {
2790
2791 if (fs_size > 0) {
8621b110 2792 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
2793
2794 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2795 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2796
2797 } else
2798 m->keep_free = DEFAULT_KEEP_FREE;
2799 }
2800
2b43f939
LP
2801 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2802 format_bytes(a, sizeof(a), m->max_use),
2803 format_bytes(b, sizeof(b), m->max_size),
2804 format_bytes(c, sizeof(c), m->min_size),
2805 format_bytes(d, sizeof(d), m->keep_free));
babfc091 2806}
08984293
LP
2807
2808int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
2809 assert(f);
2810 assert(from || to);
2811
2812 if (from) {
162566a4
LP
2813 if (f->header->head_entry_realtime == 0)
2814 return -ENOENT;
08984293 2815
162566a4 2816 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
2817 }
2818
2819 if (to) {
162566a4
LP
2820 if (f->header->tail_entry_realtime == 0)
2821 return -ENOENT;
08984293 2822
162566a4 2823 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
2824 }
2825
2826 return 1;
2827}
2828
2829int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
2830 Object *o;
2831 uint64_t p;
2832 int r;
2833
2834 assert(f);
2835 assert(from || to);
2836
47838ab3 2837 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
2838 if (r <= 0)
2839 return r;
2840
2841 if (le64toh(o->data.n_entries) <= 0)
2842 return 0;
2843
2844 if (from) {
2845 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2846 if (r < 0)
2847 return r;
2848
2849 *from = le64toh(o->entry.monotonic);
2850 }
2851
2852 if (to) {
2853 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2854 if (r < 0)
2855 return r;
2856
2857 r = generic_array_get_plus_one(f,
2858 le64toh(o->data.entry_offset),
2859 le64toh(o->data.entry_array_offset),
2860 le64toh(o->data.n_entries)-1,
2861 &o, NULL);
2862 if (r <= 0)
2863 return r;
2864
2865 *to = le64toh(o->entry.monotonic);
2866 }
2867
2868 return 1;
2869}
dca6219e 2870
fb0951b0 2871bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
2872 assert(f);
2873
2874 /* If we gained new header fields we gained new features,
2875 * hence suggest a rotation */
361f9cbc
LP
2876 if (le64toh(f->header->header_size) < sizeof(Header)) {
2877 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 2878 return true;
361f9cbc 2879 }
dca6219e
LP
2880
2881 /* Let's check if the hash tables grew over a certain fill
2882 * level (75%, borrowing this value from Java's hash table
2883 * implementation), and if so suggest a rotation. To calculate
2884 * the fill level we need the n_data field, which only exists
2885 * in newer versions. */
2886
2887 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 2888 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 2889 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
2890 f->path,
2891 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
2892 le64toh(f->header->n_data),
2893 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2894 (unsigned long long) f->last_stat.st_size,
2895 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 2896 return true;
361f9cbc 2897 }
dca6219e
LP
2898
2899 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 2900 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 2901 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
2902 f->path,
2903 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
2904 le64toh(f->header->n_fields),
2905 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 2906 return true;
361f9cbc 2907 }
dca6219e 2908
0598fd4a
LP
2909 /* Are the data objects properly indexed by field objects? */
2910 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2911 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2912 le64toh(f->header->n_data) > 0 &&
2913 le64toh(f->header->n_fields) == 0)
2914 return true;
2915
fb0951b0
LP
2916 if (max_file_usec > 0) {
2917 usec_t t, h;
2918
2919 h = le64toh(f->header->head_entry_realtime);
2920 t = now(CLOCK_REALTIME);
2921
2922 if (h > 0 && t > h + max_file_usec)
2923 return true;
2924 }
2925
dca6219e
LP
2926 return false;
2927}