]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
update TODO
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <sys/mman.h>
23#include <errno.h>
24#include <sys/uio.h>
25#include <unistd.h>
26#include <sys/statvfs.h>
27#include <fcntl.h>
28#include <stddef.h>
29
fb0951b0
LP
30#ifdef HAVE_XATTR
31#include <attr/xattr.h>
32#endif
33
cec736d2
LP
34#include "journal-def.h"
35#include "journal-file.h"
0284adc6 36#include "journal-authenticate.h"
cec736d2 37#include "lookup3.h"
807e17f0 38#include "compress.h"
7560fffc 39#include "fsprg.h"
cec736d2 40
4a92baf3
LP
41#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 43
be19b7df 44#define COMPRESSION_SIZE_THRESHOLD (512ULL)
807e17f0 45
babfc091 46/* This is the minimum journal file size */
253f59df 47#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
babfc091
LP
48
49/* These are the lower and upper bounds if we deduce the max_use value
50 * from the file system size */
51#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
52#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
53
54/* This is the upper bound if we deduce max_size from max_use */
71100051 55#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
babfc091
LP
56
57/* This is the upper bound if we deduce the keep_free value from the
58 * file system size */
59#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
60
61/* This is the keep_free value when we can't determine the system
62 * size */
63#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
64
dca6219e
LP
65/* n_data was the first entry we added after the initial file format design */
66#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 67
a4bcff5b
LP
68/* How many entries to keep in the entry array chain cache at max */
69#define CHAIN_CACHE_MAX 20
70
a676e665
LP
71/* How much to increase the journal file size at once each time we allocate something new. */
72#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
73
9588bc32 74static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
75 assert(f);
76
77 if (!f->writable)
78 return -EPERM;
79
80 if (!(f->fd >= 0 && f->header))
81 return -EINVAL;
82
83 switch(f->header->state) {
84 case STATE_ONLINE:
85 return 0;
86
87 case STATE_OFFLINE:
88 f->header->state = STATE_ONLINE;
89 fsync(f->fd);
90 return 0;
91
92 default:
93 return -EINVAL;
94 }
95}
96
97int journal_file_set_offline(JournalFile *f) {
98 assert(f);
99
100 if (!f->writable)
101 return -EPERM;
102
103 if (!(f->fd >= 0 && f->header))
104 return -EINVAL;
105
106 if (f->header->state != STATE_ONLINE)
107 return 0;
108
109 fsync(f->fd);
110
111 f->header->state = STATE_OFFLINE;
112
113 fsync(f->fd);
114
115 return 0;
116}
117
cec736d2 118void journal_file_close(JournalFile *f) {
de190aef 119 assert(f);
cec736d2 120
feb12d3e 121#ifdef HAVE_GCRYPT
b0af6f41 122 /* Write the final tag */
c586dbf1 123 if (f->seal && f->writable)
b0af6f41 124 journal_file_append_tag(f);
feb12d3e 125#endif
b0af6f41 126
7560fffc 127 /* Sync everything to disk, before we mark the file offline */
16e9f408
LP
128 if (f->mmap && f->fd >= 0)
129 mmap_cache_close_fd(f->mmap, f->fd);
7560fffc 130
26687bf8 131 journal_file_set_offline(f);
cec736d2 132
26687bf8 133 if (f->header)
d384c7a8 134 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
cec736d2 135
0ac38b70
LP
136 if (f->fd >= 0)
137 close_nointr_nofail(f->fd);
138
cec736d2 139 free(f->path);
807e17f0 140
16e9f408
LP
141 if (f->mmap)
142 mmap_cache_unref(f->mmap);
143
a4bcff5b
LP
144 hashmap_free_free(f->chain_cache);
145
807e17f0
LP
146#ifdef HAVE_XZ
147 free(f->compress_buffer);
148#endif
149
7560fffc 150#ifdef HAVE_GCRYPT
baed47c3
LP
151 if (f->fss_file)
152 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
b7c9ae91
LP
153 else if (f->fsprg_state)
154 free(f->fsprg_state);
155
156 free(f->fsprg_seed);
7560fffc
LP
157
158 if (f->hmac)
159 gcry_md_close(f->hmac);
160#endif
161
cec736d2
LP
162 free(f);
163}
164
0ac38b70 165static int journal_file_init_header(JournalFile *f, JournalFile *template) {
cec736d2
LP
166 Header h;
167 ssize_t k;
168 int r;
169
170 assert(f);
171
172 zero(h);
7560fffc 173 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 174 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 175
7560fffc
LP
176 h.incompatible_flags =
177 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
178
179 h.compatible_flags =
baed47c3 180 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
7560fffc 181
cec736d2
LP
182 r = sd_id128_randomize(&h.file_id);
183 if (r < 0)
184 return r;
185
0ac38b70
LP
186 if (template) {
187 h.seqnum_id = template->header->seqnum_id;
beec0085 188 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
189 } else
190 h.seqnum_id = h.file_id;
cec736d2
LP
191
192 k = pwrite(f->fd, &h, sizeof(h), 0);
193 if (k < 0)
194 return -errno;
195
196 if (k != sizeof(h))
197 return -EIO;
198
199 return 0;
200}
201
202static int journal_file_refresh_header(JournalFile *f) {
203 int r;
de190aef 204 sd_id128_t boot_id;
cec736d2
LP
205
206 assert(f);
207
208 r = sd_id128_get_machine(&f->header->machine_id);
209 if (r < 0)
210 return r;
211
de190aef 212 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
213 if (r < 0)
214 return r;
215
de190aef
LP
216 if (sd_id128_equal(boot_id, f->header->boot_id))
217 f->tail_entry_monotonic_valid = true;
218
219 f->header->boot_id = boot_id;
220
26687bf8 221 journal_file_set_online(f);
b788cc23 222
7560fffc 223 /* Sync the online state to disk */
a676e665 224 fsync(f->fd);
b788cc23 225
cec736d2
LP
226 return 0;
227}
228
229static int journal_file_verify_header(JournalFile *f) {
230 assert(f);
231
7560fffc 232 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
233 return -EBADMSG;
234
7560fffc
LP
235 /* In both read and write mode we refuse to open files with
236 * incompatible flags we don't know */
807e17f0 237#ifdef HAVE_XZ
7560fffc 238 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
807e17f0
LP
239 return -EPROTONOSUPPORT;
240#else
cec736d2
LP
241 if (f->header->incompatible_flags != 0)
242 return -EPROTONOSUPPORT;
807e17f0 243#endif
cec736d2 244
7560fffc
LP
245 /* When open for writing we refuse to open files with
246 * compatible flags, too */
247 if (f->writable) {
248#ifdef HAVE_GCRYPT
baed47c3 249 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
7560fffc
LP
250 return -EPROTONOSUPPORT;
251#else
252 if (f->header->compatible_flags != 0)
253 return -EPROTONOSUPPORT;
254#endif
255 }
256
db11ac1a
LP
257 if (f->header->state >= _STATE_MAX)
258 return -EBADMSG;
259
dca6219e
LP
260 /* The first addition was n_data, so check that we are at least this large */
261 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
262 return -EBADMSG;
263
8088cbd3 264 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
265 return -EBADMSG;
266
db11ac1a
LP
267 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
268 return -ENODATA;
269
270 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
271 return -ENODATA;
272
7762e02b
LP
273 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
274 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
275 !VALID64(le64toh(f->header->tail_object_offset)) ||
276 !VALID64(le64toh(f->header->entry_array_offset)))
277 return -ENODATA;
278
279 if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
280 le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
281 le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
282 le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
cec736d2
LP
283 return -ENODATA;
284
285 if (f->writable) {
ccdbaf91 286 uint8_t state;
cec736d2
LP
287 sd_id128_t machine_id;
288 int r;
289
290 r = sd_id128_get_machine(&machine_id);
291 if (r < 0)
292 return r;
293
294 if (!sd_id128_equal(machine_id, f->header->machine_id))
295 return -EHOSTDOWN;
296
de190aef 297 state = f->header->state;
cec736d2 298
71fa6f00
LP
299 if (state == STATE_ONLINE) {
300 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
301 return -EBUSY;
302 } else if (state == STATE_ARCHIVED)
cec736d2 303 return -ESHUTDOWN;
71fa6f00
LP
304 else if (state != STATE_OFFLINE) {
305 log_debug("Journal file %s has unknown state %u.", f->path, state);
306 return -EBUSY;
307 }
cec736d2
LP
308 }
309
8088cbd3 310 f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
c586dbf1 311
f1889c91 312 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 313
cec736d2
LP
314 return 0;
315}
316
317static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 318 uint64_t old_size, new_size;
fec2aa2f 319 int r;
cec736d2
LP
320
321 assert(f);
322
cec736d2 323 /* We assume that this file is not sparse, and we know that
38ac38b2 324 * for sure, since we always call posix_fallocate()
cec736d2
LP
325 * ourselves */
326
327 old_size =
23b0b2b2 328 le64toh(f->header->header_size) +
cec736d2
LP
329 le64toh(f->header->arena_size);
330
bc85bfee 331 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
332 if (new_size < le64toh(f->header->header_size))
333 new_size = le64toh(f->header->header_size);
bc85bfee
LP
334
335 if (new_size <= old_size)
cec736d2
LP
336 return 0;
337
a676e665 338 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 339 return -E2BIG;
cec736d2 340
a676e665 341 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
342 struct statvfs svfs;
343
344 if (fstatvfs(f->fd, &svfs) >= 0) {
345 uint64_t available;
346
347 available = svfs.f_bfree * svfs.f_bsize;
348
bc85bfee
LP
349 if (available >= f->metrics.keep_free)
350 available -= f->metrics.keep_free;
cec736d2
LP
351 else
352 available = 0;
353
354 if (new_size - old_size > available)
355 return -E2BIG;
356 }
357 }
358
eda4b58b
LP
359 /* Increase by larger blocks at once */
360 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
361 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
362 new_size = f->metrics.max_size;
363
bc85bfee
LP
364 /* Note that the glibc fallocate() fallback is very
365 inefficient, hence we try to minimize the allocation area
366 as we can. */
fec2aa2f
GV
367 r = posix_fallocate(f->fd, old_size, new_size - old_size);
368 if (r != 0)
369 return -r;
cec736d2 370
eda4b58b
LP
371 if (fstat(f->fd, &f->last_stat) < 0)
372 return -errno;
cec736d2 373
23b0b2b2 374 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2
LP
375
376 return 0;
377}
378
fcde2389 379static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
cec736d2 380 assert(f);
cec736d2
LP
381 assert(ret);
382
7762e02b
LP
383 if (size <= 0)
384 return -EINVAL;
385
2a59ea54 386 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
387 if (offset + size > (uint64_t) f->last_stat.st_size) {
388 /* Hmm, out of range? Let's refresh the fstat() data
389 * first, before we trust that check. */
390
391 if (fstat(f->fd, &f->last_stat) < 0 ||
392 offset + size > (uint64_t) f->last_stat.st_size)
393 return -EADDRNOTAVAIL;
394 }
395
fcde2389 396 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
397}
398
16e9f408
LP
399static uint64_t minimum_header_size(Object *o) {
400
b8e891e6 401 static const uint64_t table[] = {
16e9f408
LP
402 [OBJECT_DATA] = sizeof(DataObject),
403 [OBJECT_FIELD] = sizeof(FieldObject),
404 [OBJECT_ENTRY] = sizeof(EntryObject),
405 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
406 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
407 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
408 [OBJECT_TAG] = sizeof(TagObject),
409 };
410
411 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
412 return sizeof(ObjectHeader);
413
414 return table[o->object.type];
415}
416
de190aef 417int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
cec736d2
LP
418 int r;
419 void *t;
420 Object *o;
421 uint64_t s;
422
423 assert(f);
424 assert(ret);
425
db11ac1a
LP
426 /* Objects may only be located at multiple of 64 bit */
427 if (!VALID64(offset))
428 return -EFAULT;
429
16e9f408 430
ae97089d 431 r = journal_file_move_to(f, type_to_context(type), false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
432 if (r < 0)
433 return r;
434
435 o = (Object*) t;
436 s = le64toh(o->object.size);
437
438 if (s < sizeof(ObjectHeader))
439 return -EBADMSG;
440
16e9f408
LP
441 if (o->object.type <= OBJECT_UNUSED)
442 return -EBADMSG;
443
444 if (s < minimum_header_size(o))
445 return -EBADMSG;
446
3c1668da 447 if (type > 0 && o->object.type != type)
cec736d2
LP
448 return -EBADMSG;
449
450 if (s > sizeof(ObjectHeader)) {
fcde2389 451 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
cec736d2
LP
452 if (r < 0)
453 return r;
454
455 o = (Object*) t;
456 }
457
cec736d2
LP
458 *ret = o;
459 return 0;
460}
461
d98cc1f2 462static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
463 uint64_t r;
464
465 assert(f);
466
beec0085 467 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
468
469 if (seqnum) {
de190aef 470 /* If an external seqnum counter was passed, we update
c2373f84
LP
471 * both the local and the external one, and set it to
472 * the maximum of both */
473
474 if (*seqnum + 1 > r)
475 r = *seqnum + 1;
476
477 *seqnum = r;
478 }
479
beec0085 480 f->header->tail_entry_seqnum = htole64(r);
cec736d2 481
beec0085
LP
482 if (f->header->head_entry_seqnum == 0)
483 f->header->head_entry_seqnum = htole64(r);
de190aef 484
cec736d2
LP
485 return r;
486}
487
0284adc6 488int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
489 int r;
490 uint64_t p;
491 Object *tail, *o;
492 void *t;
493
494 assert(f);
16e9f408 495 assert(type > 0 && type < _OBJECT_TYPE_MAX);
cec736d2
LP
496 assert(size >= sizeof(ObjectHeader));
497 assert(offset);
498 assert(ret);
499
26687bf8
OS
500 r = journal_file_set_online(f);
501 if (r < 0)
502 return r;
503
cec736d2 504 p = le64toh(f->header->tail_object_offset);
cec736d2 505 if (p == 0)
23b0b2b2 506 p = le64toh(f->header->header_size);
cec736d2 507 else {
de190aef 508 r = journal_file_move_to_object(f, -1, p, &tail);
cec736d2
LP
509 if (r < 0)
510 return r;
511
512 p += ALIGN64(le64toh(tail->object.size));
513 }
514
515 r = journal_file_allocate(f, p, size);
516 if (r < 0)
517 return r;
518
fcde2389 519 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
520 if (r < 0)
521 return r;
522
523 o = (Object*) t;
524
525 zero(o->object);
de190aef 526 o->object.type = type;
cec736d2
LP
527 o->object.size = htole64(size);
528
529 f->header->tail_object_offset = htole64(p);
cec736d2
LP
530 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
531
532 *ret = o;
533 *offset = p;
534
535 return 0;
536}
537
de190aef 538static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
539 uint64_t s, p;
540 Object *o;
541 int r;
542
543 assert(f);
544
dfabe643 545 /* We estimate that we need 1 hash table entry per 768 of
4a92baf3
LP
546 journal file and we want to make sure we never get beyond
547 75% fill level. Calculate the hash table size for the
548 maximum file size based on these metrics. */
549
dfabe643 550 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
551 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
552 s = DEFAULT_DATA_HASH_TABLE_SIZE;
553
507f22bd 554 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 555
de190aef
LP
556 r = journal_file_append_object(f,
557 OBJECT_DATA_HASH_TABLE,
558 offsetof(Object, hash_table.items) + s,
559 &o, &p);
cec736d2
LP
560 if (r < 0)
561 return r;
562
29804cc1 563 memzero(o->hash_table.items, s);
cec736d2 564
de190aef
LP
565 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
566 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
567
568 return 0;
569}
570
de190aef 571static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
572 uint64_t s, p;
573 Object *o;
574 int r;
575
576 assert(f);
577
3c1668da
LP
578 /* We use a fixed size hash table for the fields as this
579 * number should grow very slowly only */
580
de190aef
LP
581 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
582 r = journal_file_append_object(f,
583 OBJECT_FIELD_HASH_TABLE,
584 offsetof(Object, hash_table.items) + s,
585 &o, &p);
cec736d2
LP
586 if (r < 0)
587 return r;
588
29804cc1 589 memzero(o->hash_table.items, s);
cec736d2 590
de190aef
LP
591 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
592 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
593
594 return 0;
595}
596
de190aef 597static int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
598 uint64_t s, p;
599 void *t;
600 int r;
601
602 assert(f);
603
de190aef
LP
604 p = le64toh(f->header->data_hash_table_offset);
605 s = le64toh(f->header->data_hash_table_size);
cec736d2 606
de190aef 607 r = journal_file_move_to(f,
16e9f408 608 OBJECT_DATA_HASH_TABLE,
fcde2389 609 true,
de190aef
LP
610 p, s,
611 &t);
cec736d2
LP
612 if (r < 0)
613 return r;
614
de190aef 615 f->data_hash_table = t;
cec736d2
LP
616 return 0;
617}
618
de190aef 619static int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
620 uint64_t s, p;
621 void *t;
622 int r;
623
624 assert(f);
625
de190aef
LP
626 p = le64toh(f->header->field_hash_table_offset);
627 s = le64toh(f->header->field_hash_table_size);
cec736d2 628
de190aef 629 r = journal_file_move_to(f,
16e9f408 630 OBJECT_FIELD_HASH_TABLE,
fcde2389 631 true,
de190aef
LP
632 p, s,
633 &t);
cec736d2
LP
634 if (r < 0)
635 return r;
636
de190aef 637 f->field_hash_table = t;
cec736d2
LP
638 return 0;
639}
640
3c1668da
LP
641static int journal_file_link_field(
642 JournalFile *f,
643 Object *o,
644 uint64_t offset,
645 uint64_t hash) {
646
647 uint64_t p, h;
648 int r;
649
650 assert(f);
651 assert(o);
652 assert(offset > 0);
653
654 if (o->object.type != OBJECT_FIELD)
655 return -EINVAL;
656
657 /* This might alter the window we are looking at */
658
659 o->field.next_hash_offset = o->field.head_data_offset = 0;
660
661 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
662 p = le64toh(f->field_hash_table[h].tail_hash_offset);
663 if (p == 0)
664 f->field_hash_table[h].head_hash_offset = htole64(offset);
665 else {
666 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
667 if (r < 0)
668 return r;
669
670 o->field.next_hash_offset = htole64(offset);
671 }
672
673 f->field_hash_table[h].tail_hash_offset = htole64(offset);
674
675 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
676 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
677
678 return 0;
679}
680
681static int journal_file_link_data(
682 JournalFile *f,
683 Object *o,
684 uint64_t offset,
685 uint64_t hash) {
686
de190aef 687 uint64_t p, h;
cec736d2
LP
688 int r;
689
690 assert(f);
691 assert(o);
692 assert(offset > 0);
b588975f
LP
693
694 if (o->object.type != OBJECT_DATA)
695 return -EINVAL;
cec736d2 696
48496df6
LP
697 /* This might alter the window we are looking at */
698
de190aef
LP
699 o->data.next_hash_offset = o->data.next_field_offset = 0;
700 o->data.entry_offset = o->data.entry_array_offset = 0;
701 o->data.n_entries = 0;
cec736d2 702
de190aef 703 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
8db4213e 704 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 705 if (p == 0)
cec736d2 706 /* Only entry in the hash table is easy */
de190aef 707 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 708 else {
48496df6
LP
709 /* Move back to the previous data object, to patch in
710 * pointer */
cec736d2 711
de190aef 712 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
713 if (r < 0)
714 return r;
715
de190aef 716 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
717 }
718
de190aef 719 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 720
dca6219e
LP
721 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
722 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
723
cec736d2
LP
724 return 0;
725}
726
3c1668da
LP
727int journal_file_find_field_object_with_hash(
728 JournalFile *f,
729 const void *field, uint64_t size, uint64_t hash,
730 Object **ret, uint64_t *offset) {
731
732 uint64_t p, osize, h;
733 int r;
734
735 assert(f);
736 assert(field && size > 0);
737
738 osize = offsetof(Object, field.payload) + size;
739
740 if (f->header->field_hash_table_size == 0)
741 return -EBADMSG;
742
743 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
744 p = le64toh(f->field_hash_table[h].head_hash_offset);
745
746 while (p > 0) {
747 Object *o;
748
749 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
750 if (r < 0)
751 return r;
752
753 if (le64toh(o->field.hash) == hash &&
754 le64toh(o->object.size) == osize &&
755 memcmp(o->field.payload, field, size) == 0) {
756
757 if (ret)
758 *ret = o;
759 if (offset)
760 *offset = p;
761
762 return 1;
763 }
764
765 p = le64toh(o->field.next_hash_offset);
766 }
767
768 return 0;
769}
770
771int journal_file_find_field_object(
772 JournalFile *f,
773 const void *field, uint64_t size,
774 Object **ret, uint64_t *offset) {
775
776 uint64_t hash;
777
778 assert(f);
779 assert(field && size > 0);
780
781 hash = hash64(field, size);
782
783 return journal_file_find_field_object_with_hash(f,
784 field, size, hash,
785 ret, offset);
786}
787
de190aef
LP
788int journal_file_find_data_object_with_hash(
789 JournalFile *f,
790 const void *data, uint64_t size, uint64_t hash,
791 Object **ret, uint64_t *offset) {
48496df6 792
de190aef 793 uint64_t p, osize, h;
cec736d2
LP
794 int r;
795
796 assert(f);
797 assert(data || size == 0);
798
799 osize = offsetof(Object, data.payload) + size;
800
bc85bfee
LP
801 if (f->header->data_hash_table_size == 0)
802 return -EBADMSG;
803
de190aef
LP
804 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
805 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 806
de190aef
LP
807 while (p > 0) {
808 Object *o;
cec736d2 809
de190aef 810 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
811 if (r < 0)
812 return r;
813
807e17f0 814 if (le64toh(o->data.hash) != hash)
85a131e8 815 goto next;
807e17f0
LP
816
817 if (o->object.flags & OBJECT_COMPRESSED) {
818#ifdef HAVE_XZ
b785c858 819 uint64_t l, rsize;
cec736d2 820
807e17f0
LP
821 l = le64toh(o->object.size);
822 if (l <= offsetof(Object, data.payload))
cec736d2
LP
823 return -EBADMSG;
824
807e17f0
LP
825 l -= offsetof(Object, data.payload);
826
93b73b06 827 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
807e17f0
LP
828 return -EBADMSG;
829
b785c858 830 if (rsize == size &&
807e17f0
LP
831 memcmp(f->compress_buffer, data, size) == 0) {
832
833 if (ret)
834 *ret = o;
835
836 if (offset)
837 *offset = p;
838
839 return 1;
840 }
841#else
842 return -EPROTONOSUPPORT;
843#endif
844
845 } else if (le64toh(o->object.size) == osize &&
846 memcmp(o->data.payload, data, size) == 0) {
847
cec736d2
LP
848 if (ret)
849 *ret = o;
850
851 if (offset)
852 *offset = p;
853
de190aef 854 return 1;
cec736d2
LP
855 }
856
85a131e8 857 next:
cec736d2
LP
858 p = le64toh(o->data.next_hash_offset);
859 }
860
de190aef
LP
861 return 0;
862}
863
864int journal_file_find_data_object(
865 JournalFile *f,
866 const void *data, uint64_t size,
867 Object **ret, uint64_t *offset) {
868
869 uint64_t hash;
870
871 assert(f);
872 assert(data || size == 0);
873
874 hash = hash64(data, size);
875
876 return journal_file_find_data_object_with_hash(f,
877 data, size, hash,
878 ret, offset);
879}
880
3c1668da
LP
881static int journal_file_append_field(
882 JournalFile *f,
883 const void *field, uint64_t size,
884 Object **ret, uint64_t *offset) {
885
886 uint64_t hash, p;
887 uint64_t osize;
888 Object *o;
889 int r;
890
891 assert(f);
892 assert(field && size > 0);
893
894 hash = hash64(field, size);
895
896 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
897 if (r < 0)
898 return r;
899 else if (r > 0) {
900
901 if (ret)
902 *ret = o;
903
904 if (offset)
905 *offset = p;
906
907 return 0;
908 }
909
910 osize = offsetof(Object, field.payload) + size;
911 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
912 if (r < 0)
913 return r;
3c1668da
LP
914
915 o->field.hash = htole64(hash);
916 memcpy(o->field.payload, field, size);
917
918 r = journal_file_link_field(f, o, p, hash);
919 if (r < 0)
920 return r;
921
922 /* The linking might have altered the window, so let's
923 * refresh our pointer */
924 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
925 if (r < 0)
926 return r;
927
928#ifdef HAVE_GCRYPT
929 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
930 if (r < 0)
931 return r;
932#endif
933
934 if (ret)
935 *ret = o;
936
937 if (offset)
938 *offset = p;
939
940 return 0;
941}
942
48496df6
LP
943static int journal_file_append_data(
944 JournalFile *f,
945 const void *data, uint64_t size,
946 Object **ret, uint64_t *offset) {
947
de190aef
LP
948 uint64_t hash, p;
949 uint64_t osize;
950 Object *o;
951 int r;
807e17f0 952 bool compressed = false;
3c1668da 953 const void *eq;
de190aef
LP
954
955 assert(f);
956 assert(data || size == 0);
957
958 hash = hash64(data, size);
959
960 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
961 if (r < 0)
962 return r;
963 else if (r > 0) {
964
965 if (ret)
966 *ret = o;
967
968 if (offset)
969 *offset = p;
970
971 return 0;
972 }
973
974 osize = offsetof(Object, data.payload) + size;
975 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
976 if (r < 0)
977 return r;
978
cec736d2 979 o->data.hash = htole64(hash);
807e17f0
LP
980
981#ifdef HAVE_XZ
982 if (f->compress &&
983 size >= COMPRESSION_SIZE_THRESHOLD) {
984 uint64_t rsize;
985
986 compressed = compress_blob(data, size, o->data.payload, &rsize);
987
988 if (compressed) {
989 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
990 o->object.flags |= OBJECT_COMPRESSED;
991
507f22bd 992 log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
807e17f0
LP
993 }
994 }
995#endif
996
64825d3c 997 if (!compressed && size > 0)
807e17f0 998 memcpy(o->data.payload, data, size);
cec736d2 999
de190aef 1000 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1001 if (r < 0)
1002 return r;
1003
48496df6
LP
1004 /* The linking might have altered the window, so let's
1005 * refresh our pointer */
1006 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1007 if (r < 0)
1008 return r;
1009
08c6f819
SL
1010 if (!data)
1011 eq = NULL;
1012 else
1013 eq = memchr(data, '=', size);
3c1668da 1014 if (eq && eq > data) {
748db592 1015 Object *fo = NULL;
3c1668da 1016 uint64_t fp;
3c1668da
LP
1017
1018 /* Create field object ... */
1019 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1020 if (r < 0)
1021 return r;
1022
1023 /* ... and link it in. */
1024 o->data.next_field_offset = fo->field.head_data_offset;
1025 fo->field.head_data_offset = le64toh(p);
1026 }
1027
5996c7c2
LP
1028#ifdef HAVE_GCRYPT
1029 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1030 if (r < 0)
1031 return r;
1032#endif
1033
cec736d2
LP
1034 if (ret)
1035 *ret = o;
1036
1037 if (offset)
de190aef 1038 *offset = p;
cec736d2
LP
1039
1040 return 0;
1041}
1042
1043uint64_t journal_file_entry_n_items(Object *o) {
1044 assert(o);
b588975f
LP
1045
1046 if (o->object.type != OBJECT_ENTRY)
1047 return 0;
cec736d2
LP
1048
1049 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1050}
1051
0284adc6 1052uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1053 assert(o);
b588975f
LP
1054
1055 if (o->object.type != OBJECT_ENTRY_ARRAY)
1056 return 0;
de190aef
LP
1057
1058 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1059}
1060
fb9a24b6
LP
1061uint64_t journal_file_hash_table_n_items(Object *o) {
1062 assert(o);
b588975f
LP
1063
1064 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1065 o->object.type != OBJECT_FIELD_HASH_TABLE)
1066 return 0;
fb9a24b6
LP
1067
1068 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1069}
1070
de190aef 1071static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1072 le64_t *first,
1073 le64_t *idx,
de190aef 1074 uint64_t p) {
cec736d2 1075 int r;
de190aef
LP
1076 uint64_t n = 0, ap = 0, q, i, a, hidx;
1077 Object *o;
1078
cec736d2 1079 assert(f);
de190aef
LP
1080 assert(first);
1081 assert(idx);
1082 assert(p > 0);
cec736d2 1083
de190aef
LP
1084 a = le64toh(*first);
1085 i = hidx = le64toh(*idx);
1086 while (a > 0) {
1087
1088 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1089 if (r < 0)
1090 return r;
cec736d2 1091
de190aef
LP
1092 n = journal_file_entry_array_n_items(o);
1093 if (i < n) {
1094 o->entry_array.items[i] = htole64(p);
1095 *idx = htole64(hidx + 1);
1096 return 0;
1097 }
cec736d2 1098
de190aef
LP
1099 i -= n;
1100 ap = a;
1101 a = le64toh(o->entry_array.next_entry_array_offset);
1102 }
1103
1104 if (hidx > n)
1105 n = (hidx+1) * 2;
1106 else
1107 n = n * 2;
1108
1109 if (n < 4)
1110 n = 4;
1111
1112 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1113 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1114 &o, &q);
cec736d2
LP
1115 if (r < 0)
1116 return r;
1117
feb12d3e 1118#ifdef HAVE_GCRYPT
5996c7c2 1119 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1120 if (r < 0)
1121 return r;
feb12d3e 1122#endif
b0af6f41 1123
de190aef 1124 o->entry_array.items[i] = htole64(p);
cec736d2 1125
de190aef 1126 if (ap == 0)
7be3aa17 1127 *first = htole64(q);
cec736d2 1128 else {
de190aef 1129 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1130 if (r < 0)
1131 return r;
1132
de190aef
LP
1133 o->entry_array.next_entry_array_offset = htole64(q);
1134 }
cec736d2 1135
2dee23eb
LP
1136 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1137 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1138
de190aef
LP
1139 *idx = htole64(hidx + 1);
1140
1141 return 0;
1142}
cec736d2 1143
de190aef 1144static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1145 le64_t *extra,
1146 le64_t *first,
1147 le64_t *idx,
de190aef
LP
1148 uint64_t p) {
1149
1150 int r;
1151
1152 assert(f);
1153 assert(extra);
1154 assert(first);
1155 assert(idx);
1156 assert(p > 0);
1157
1158 if (*idx == 0)
1159 *extra = htole64(p);
1160 else {
4fd052ae 1161 le64_t i;
de190aef 1162
7be3aa17 1163 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1164 r = link_entry_into_array(f, first, &i, p);
1165 if (r < 0)
1166 return r;
cec736d2
LP
1167 }
1168
de190aef
LP
1169 *idx = htole64(le64toh(*idx) + 1);
1170 return 0;
1171}
1172
1173static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1174 uint64_t p;
1175 int r;
1176 assert(f);
1177 assert(o);
1178 assert(offset > 0);
1179
1180 p = le64toh(o->entry.items[i].object_offset);
1181 if (p == 0)
1182 return -EINVAL;
1183
1184 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1185 if (r < 0)
1186 return r;
1187
de190aef
LP
1188 return link_entry_into_array_plus_one(f,
1189 &o->data.entry_offset,
1190 &o->data.entry_array_offset,
1191 &o->data.n_entries,
1192 offset);
cec736d2
LP
1193}
1194
1195static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1196 uint64_t n, i;
cec736d2
LP
1197 int r;
1198
1199 assert(f);
1200 assert(o);
1201 assert(offset > 0);
b588975f
LP
1202
1203 if (o->object.type != OBJECT_ENTRY)
1204 return -EINVAL;
cec736d2 1205
b788cc23
LP
1206 __sync_synchronize();
1207
cec736d2 1208 /* Link up the entry itself */
de190aef
LP
1209 r = link_entry_into_array(f,
1210 &f->header->entry_array_offset,
1211 &f->header->n_entries,
1212 offset);
1213 if (r < 0)
1214 return r;
cec736d2 1215
507f22bd 1216 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1217
de190aef 1218 if (f->header->head_entry_realtime == 0)
0ac38b70 1219 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1220
0ac38b70 1221 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1222 f->header->tail_entry_monotonic = o->entry.monotonic;
1223
1224 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1225
1226 /* Link up the items */
1227 n = journal_file_entry_n_items(o);
1228 for (i = 0; i < n; i++) {
1229 r = journal_file_link_entry_item(f, o, offset, i);
1230 if (r < 0)
1231 return r;
1232 }
1233
cec736d2
LP
1234 return 0;
1235}
1236
1237static int journal_file_append_entry_internal(
1238 JournalFile *f,
1239 const dual_timestamp *ts,
1240 uint64_t xor_hash,
1241 const EntryItem items[], unsigned n_items,
de190aef 1242 uint64_t *seqnum,
cec736d2
LP
1243 Object **ret, uint64_t *offset) {
1244 uint64_t np;
1245 uint64_t osize;
1246 Object *o;
1247 int r;
1248
1249 assert(f);
1250 assert(items || n_items == 0);
de190aef 1251 assert(ts);
cec736d2
LP
1252
1253 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1254
de190aef 1255 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1256 if (r < 0)
1257 return r;
1258
d98cc1f2 1259 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1260 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1261 o->entry.realtime = htole64(ts->realtime);
1262 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1263 o->entry.xor_hash = htole64(xor_hash);
1264 o->entry.boot_id = f->header->boot_id;
1265
feb12d3e 1266#ifdef HAVE_GCRYPT
5996c7c2 1267 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1268 if (r < 0)
1269 return r;
feb12d3e 1270#endif
b0af6f41 1271
cec736d2
LP
1272 r = journal_file_link_entry(f, o, np);
1273 if (r < 0)
1274 return r;
1275
1276 if (ret)
1277 *ret = o;
1278
1279 if (offset)
1280 *offset = np;
1281
1282 return 0;
1283}
1284
cf244689 1285void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1286 assert(f);
1287
1288 /* inotify() does not receive IN_MODIFY events from file
1289 * accesses done via mmap(). After each access we hence
1290 * trigger IN_MODIFY by truncating the journal file to its
1291 * current size which triggers IN_MODIFY. */
1292
bc85bfee
LP
1293 __sync_synchronize();
1294
50f20cfd 1295 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
c5315881 1296 log_error("Failed to truncate file to its own size: %m");
50f20cfd
LP
1297}
1298
1f2da9ec
LP
1299static int entry_item_cmp(const void *_a, const void *_b) {
1300 const EntryItem *a = _a, *b = _b;
1301
1302 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1303 return -1;
1304 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1305 return 1;
1306 return 0;
1307}
1308
de190aef 1309int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1310 unsigned i;
1311 EntryItem *items;
1312 int r;
1313 uint64_t xor_hash = 0;
de190aef 1314 struct dual_timestamp _ts;
cec736d2
LP
1315
1316 assert(f);
1317 assert(iovec || n_iovec == 0);
1318
de190aef
LP
1319 if (!ts) {
1320 dual_timestamp_get(&_ts);
1321 ts = &_ts;
1322 }
1323
1324 if (f->tail_entry_monotonic_valid &&
1325 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1326 return -EINVAL;
1327
feb12d3e 1328#ifdef HAVE_GCRYPT
7560fffc
LP
1329 r = journal_file_maybe_append_tag(f, ts->realtime);
1330 if (r < 0)
1331 return r;
feb12d3e 1332#endif
7560fffc 1333
64825d3c 1334 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1335 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1336
1337 for (i = 0; i < n_iovec; i++) {
1338 uint64_t p;
1339 Object *o;
1340
1341 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1342 if (r < 0)
cf244689 1343 return r;
cec736d2
LP
1344
1345 xor_hash ^= le64toh(o->data.hash);
1346 items[i].object_offset = htole64(p);
de7b95cd 1347 items[i].hash = o->data.hash;
cec736d2
LP
1348 }
1349
1f2da9ec
LP
1350 /* Order by the position on disk, in order to improve seek
1351 * times for rotating media. */
7ff7394d 1352 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1353
de190aef 1354 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1355
50f20cfd
LP
1356 journal_file_post_change(f);
1357
cec736d2
LP
1358 return r;
1359}
1360
a4bcff5b
LP
1361typedef struct ChainCacheItem {
1362 uint64_t first; /* the array at the begin of the chain */
1363 uint64_t array; /* the cached array */
1364 uint64_t begin; /* the first item in the cached array */
1365 uint64_t total; /* the total number of items in all arrays before this one in the chain */
f268980d 1366 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
a4bcff5b
LP
1367} ChainCacheItem;
1368
1369static void chain_cache_put(
1370 Hashmap *h,
1371 ChainCacheItem *ci,
1372 uint64_t first,
1373 uint64_t array,
1374 uint64_t begin,
f268980d
LP
1375 uint64_t total,
1376 uint64_t last_index) {
a4bcff5b
LP
1377
1378 if (!ci) {
34741aa3
LP
1379 /* If the chain item to cache for this chain is the
1380 * first one it's not worth caching anything */
1381 if (array == first)
1382 return;
1383
a4bcff5b
LP
1384 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1385 ci = hashmap_steal_first(h);
1386 else {
1387 ci = new(ChainCacheItem, 1);
1388 if (!ci)
1389 return;
1390 }
1391
1392 ci->first = first;
1393
1394 if (hashmap_put(h, &ci->first, ci) < 0) {
1395 free(ci);
1396 return;
1397 }
1398 } else
1399 assert(ci->first == first);
1400
1401 ci->array = array;
1402 ci->begin = begin;
1403 ci->total = total;
f268980d 1404 ci->last_index = last_index;
a4bcff5b
LP
1405}
1406
f268980d
LP
1407static int generic_array_get(
1408 JournalFile *f,
1409 uint64_t first,
1410 uint64_t i,
1411 Object **ret, uint64_t *offset) {
de190aef 1412
cec736d2 1413 Object *o;
a4bcff5b 1414 uint64_t p = 0, a, t = 0;
cec736d2 1415 int r;
a4bcff5b 1416 ChainCacheItem *ci;
cec736d2
LP
1417
1418 assert(f);
1419
de190aef 1420 a = first;
a4bcff5b
LP
1421
1422 /* Try the chain cache first */
1423 ci = hashmap_get(f->chain_cache, &first);
1424 if (ci && i > ci->total) {
1425 a = ci->array;
1426 i -= ci->total;
1427 t = ci->total;
1428 }
1429
de190aef 1430 while (a > 0) {
a4bcff5b 1431 uint64_t k;
cec736d2 1432
de190aef
LP
1433 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1434 if (r < 0)
1435 return r;
cec736d2 1436
a4bcff5b
LP
1437 k = journal_file_entry_array_n_items(o);
1438 if (i < k) {
de190aef 1439 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1440 goto found;
cec736d2
LP
1441 }
1442
a4bcff5b
LP
1443 i -= k;
1444 t += k;
de190aef
LP
1445 a = le64toh(o->entry_array.next_entry_array_offset);
1446 }
1447
a4bcff5b
LP
1448 return 0;
1449
1450found:
1451 /* Let's cache this item for the next invocation */
af13a6b0 1452 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1453
1454 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1455 if (r < 0)
1456 return r;
1457
1458 if (ret)
1459 *ret = o;
1460
1461 if (offset)
1462 *offset = p;
1463
1464 return 1;
1465}
1466
f268980d
LP
1467static int generic_array_get_plus_one(
1468 JournalFile *f,
1469 uint64_t extra,
1470 uint64_t first,
1471 uint64_t i,
1472 Object **ret, uint64_t *offset) {
de190aef
LP
1473
1474 Object *o;
1475
1476 assert(f);
1477
1478 if (i == 0) {
1479 int r;
1480
1481 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1482 if (r < 0)
1483 return r;
1484
de190aef
LP
1485 if (ret)
1486 *ret = o;
cec736d2 1487
de190aef
LP
1488 if (offset)
1489 *offset = extra;
cec736d2 1490
de190aef 1491 return 1;
cec736d2
LP
1492 }
1493
de190aef
LP
1494 return generic_array_get(f, first, i-1, ret, offset);
1495}
cec736d2 1496
de190aef
LP
1497enum {
1498 TEST_FOUND,
1499 TEST_LEFT,
1500 TEST_RIGHT
1501};
cec736d2 1502
f268980d
LP
1503static int generic_array_bisect(
1504 JournalFile *f,
1505 uint64_t first,
1506 uint64_t n,
1507 uint64_t needle,
1508 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1509 direction_t direction,
1510 Object **ret,
1511 uint64_t *offset,
1512 uint64_t *idx) {
1513
1514 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1515 bool subtract_one = false;
1516 Object *o, *array = NULL;
1517 int r;
a4bcff5b 1518 ChainCacheItem *ci;
cec736d2 1519
de190aef
LP
1520 assert(f);
1521 assert(test_object);
cec736d2 1522
a4bcff5b 1523 /* Start with the first array in the chain */
de190aef 1524 a = first;
a4bcff5b
LP
1525
1526 ci = hashmap_get(f->chain_cache, &first);
1527 if (ci && n > ci->total) {
1528 /* Ah, we have iterated this bisection array chain
1529 * previously! Let's see if we can skip ahead in the
1530 * chain, as far as the last time. But we can't jump
1531 * backwards in the chain, so let's check that
1532 * first. */
1533
1534 r = test_object(f, ci->begin, needle);
1535 if (r < 0)
1536 return r;
1537
1538 if (r == TEST_LEFT) {
f268980d 1539 /* OK, what we are looking for is right of the
a4bcff5b
LP
1540 * begin of this EntryArray, so let's jump
1541 * straight to previously cached array in the
1542 * chain */
1543
1544 a = ci->array;
1545 n -= ci->total;
1546 t = ci->total;
f268980d 1547 last_index = ci->last_index;
a4bcff5b
LP
1548 }
1549 }
1550
de190aef
LP
1551 while (a > 0) {
1552 uint64_t left, right, k, lp;
1553
1554 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1555 if (r < 0)
1556 return r;
1557
de190aef
LP
1558 k = journal_file_entry_array_n_items(array);
1559 right = MIN(k, n);
1560 if (right <= 0)
1561 return 0;
cec736d2 1562
de190aef
LP
1563 i = right - 1;
1564 lp = p = le64toh(array->entry_array.items[i]);
1565 if (p <= 0)
1566 return -EBADMSG;
cec736d2 1567
de190aef
LP
1568 r = test_object(f, p, needle);
1569 if (r < 0)
1570 return r;
cec736d2 1571
de190aef
LP
1572 if (r == TEST_FOUND)
1573 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1574
1575 if (r == TEST_RIGHT) {
1576 left = 0;
1577 right -= 1;
f268980d
LP
1578
1579 if (last_index != (uint64_t) -1) {
1580 assert(last_index <= right);
1581
1582 /* If we cached the last index we
1583 * looked at, let's try to not to jump
1584 * too wildly around and see if we can
1585 * limit the range to look at early to
1586 * the immediate neighbors of the last
1587 * index we looked at. */
1588
1589 if (last_index > 0) {
1590 uint64_t x = last_index - 1;
1591
1592 p = le64toh(array->entry_array.items[x]);
1593 if (p <= 0)
1594 return -EBADMSG;
1595
1596 r = test_object(f, p, needle);
1597 if (r < 0)
1598 return r;
1599
1600 if (r == TEST_FOUND)
1601 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1602
1603 if (r == TEST_RIGHT)
1604 right = x;
1605 else
1606 left = x + 1;
1607 }
1608
1609 if (last_index < right) {
1610 uint64_t y = last_index + 1;
1611
1612 p = le64toh(array->entry_array.items[y]);
1613 if (p <= 0)
1614 return -EBADMSG;
1615
1616 r = test_object(f, p, needle);
1617 if (r < 0)
1618 return r;
1619
1620 if (r == TEST_FOUND)
1621 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1622
1623 if (r == TEST_RIGHT)
1624 right = y;
1625 else
1626 left = y + 1;
1627 }
f268980d
LP
1628 }
1629
de190aef
LP
1630 for (;;) {
1631 if (left == right) {
1632 if (direction == DIRECTION_UP)
1633 subtract_one = true;
1634
1635 i = left;
1636 goto found;
1637 }
1638
1639 assert(left < right);
de190aef 1640 i = (left + right) / 2;
f268980d 1641
de190aef
LP
1642 p = le64toh(array->entry_array.items[i]);
1643 if (p <= 0)
1644 return -EBADMSG;
1645
1646 r = test_object(f, p, needle);
1647 if (r < 0)
1648 return r;
cec736d2 1649
de190aef
LP
1650 if (r == TEST_FOUND)
1651 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1652
1653 if (r == TEST_RIGHT)
1654 right = i;
1655 else
1656 left = i + 1;
1657 }
1658 }
1659
cbdca852
LP
1660 if (k > n) {
1661 if (direction == DIRECTION_UP) {
1662 i = n;
1663 subtract_one = true;
1664 goto found;
1665 }
1666
cec736d2 1667 return 0;
cbdca852 1668 }
cec736d2 1669
de190aef
LP
1670 last_p = lp;
1671
1672 n -= k;
1673 t += k;
f268980d 1674 last_index = (uint64_t) -1;
de190aef 1675 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1676 }
1677
1678 return 0;
de190aef
LP
1679
1680found:
1681 if (subtract_one && t == 0 && i == 0)
1682 return 0;
1683
a4bcff5b 1684 /* Let's cache this item for the next invocation */
af13a6b0 1685 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 1686
de190aef
LP
1687 if (subtract_one && i == 0)
1688 p = last_p;
1689 else if (subtract_one)
1690 p = le64toh(array->entry_array.items[i-1]);
1691 else
1692 p = le64toh(array->entry_array.items[i]);
1693
1694 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1695 if (r < 0)
1696 return r;
1697
1698 if (ret)
1699 *ret = o;
1700
1701 if (offset)
1702 *offset = p;
1703
1704 if (idx)
cbdca852 1705 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1706
1707 return 1;
cec736d2
LP
1708}
1709
f268980d
LP
1710
1711static int generic_array_bisect_plus_one(
1712 JournalFile *f,
1713 uint64_t extra,
1714 uint64_t first,
1715 uint64_t n,
1716 uint64_t needle,
1717 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1718 direction_t direction,
1719 Object **ret,
1720 uint64_t *offset,
1721 uint64_t *idx) {
de190aef 1722
cec736d2 1723 int r;
cbdca852
LP
1724 bool step_back = false;
1725 Object *o;
cec736d2
LP
1726
1727 assert(f);
de190aef 1728 assert(test_object);
cec736d2 1729
de190aef
LP
1730 if (n <= 0)
1731 return 0;
cec736d2 1732
de190aef
LP
1733 /* This bisects the array in object 'first', but first checks
1734 * an extra */
de190aef
LP
1735 r = test_object(f, extra, needle);
1736 if (r < 0)
1737 return r;
a536e261
LP
1738
1739 if (r == TEST_FOUND)
1740 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1741
cbdca852
LP
1742 /* if we are looking with DIRECTION_UP then we need to first
1743 see if in the actual array there is a matching entry, and
1744 return the last one of that. But if there isn't any we need
1745 to return this one. Hence remember this, and return it
1746 below. */
1747 if (r == TEST_LEFT)
1748 step_back = direction == DIRECTION_UP;
de190aef 1749
cbdca852
LP
1750 if (r == TEST_RIGHT) {
1751 if (direction == DIRECTION_DOWN)
1752 goto found;
1753 else
1754 return 0;
a536e261 1755 }
cec736d2 1756
de190aef
LP
1757 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1758
cbdca852
LP
1759 if (r == 0 && step_back)
1760 goto found;
1761
ecf68b1d 1762 if (r > 0 && idx)
de190aef
LP
1763 (*idx) ++;
1764
1765 return r;
cbdca852
LP
1766
1767found:
1768 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1769 if (r < 0)
1770 return r;
1771
1772 if (ret)
1773 *ret = o;
1774
1775 if (offset)
1776 *offset = extra;
1777
1778 if (idx)
1779 *idx = 0;
1780
1781 return 1;
1782}
1783
44a6b1b6 1784_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1785 assert(f);
1786 assert(p > 0);
1787
1788 if (p == needle)
1789 return TEST_FOUND;
1790 else if (p < needle)
1791 return TEST_LEFT;
1792 else
1793 return TEST_RIGHT;
1794}
1795
1796int journal_file_move_to_entry_by_offset(
1797 JournalFile *f,
1798 uint64_t p,
1799 direction_t direction,
1800 Object **ret,
1801 uint64_t *offset) {
1802
1803 return generic_array_bisect(f,
1804 le64toh(f->header->entry_array_offset),
1805 le64toh(f->header->n_entries),
1806 p,
1807 test_object_offset,
1808 direction,
1809 ret, offset, NULL);
de190aef
LP
1810}
1811
cbdca852 1812
de190aef
LP
1813static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1814 Object *o;
1815 int r;
1816
1817 assert(f);
1818 assert(p > 0);
1819
1820 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1821 if (r < 0)
1822 return r;
1823
de190aef
LP
1824 if (le64toh(o->entry.seqnum) == needle)
1825 return TEST_FOUND;
1826 else if (le64toh(o->entry.seqnum) < needle)
1827 return TEST_LEFT;
1828 else
1829 return TEST_RIGHT;
1830}
cec736d2 1831
de190aef
LP
1832int journal_file_move_to_entry_by_seqnum(
1833 JournalFile *f,
1834 uint64_t seqnum,
1835 direction_t direction,
1836 Object **ret,
1837 uint64_t *offset) {
1838
1839 return generic_array_bisect(f,
1840 le64toh(f->header->entry_array_offset),
1841 le64toh(f->header->n_entries),
1842 seqnum,
1843 test_object_seqnum,
1844 direction,
1845 ret, offset, NULL);
1846}
cec736d2 1847
de190aef
LP
1848static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1849 Object *o;
1850 int r;
1851
1852 assert(f);
1853 assert(p > 0);
1854
1855 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1856 if (r < 0)
1857 return r;
1858
1859 if (le64toh(o->entry.realtime) == needle)
1860 return TEST_FOUND;
1861 else if (le64toh(o->entry.realtime) < needle)
1862 return TEST_LEFT;
1863 else
1864 return TEST_RIGHT;
cec736d2
LP
1865}
1866
de190aef
LP
1867int journal_file_move_to_entry_by_realtime(
1868 JournalFile *f,
1869 uint64_t realtime,
1870 direction_t direction,
1871 Object **ret,
1872 uint64_t *offset) {
1873
1874 return generic_array_bisect(f,
1875 le64toh(f->header->entry_array_offset),
1876 le64toh(f->header->n_entries),
1877 realtime,
1878 test_object_realtime,
1879 direction,
1880 ret, offset, NULL);
1881}
1882
1883static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1884 Object *o;
1885 int r;
1886
1887 assert(f);
1888 assert(p > 0);
1889
1890 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1891 if (r < 0)
1892 return r;
1893
1894 if (le64toh(o->entry.monotonic) == needle)
1895 return TEST_FOUND;
1896 else if (le64toh(o->entry.monotonic) < needle)
1897 return TEST_LEFT;
1898 else
1899 return TEST_RIGHT;
1900}
1901
47838ab3
ZJS
1902static inline int find_data_object_by_boot_id(
1903 JournalFile *f,
1904 sd_id128_t boot_id,
1905 Object **o,
1906 uint64_t *b) {
1907 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1908
1909 sd_id128_to_string(boot_id, t + 9);
1910 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1911}
1912
de190aef
LP
1913int journal_file_move_to_entry_by_monotonic(
1914 JournalFile *f,
1915 sd_id128_t boot_id,
1916 uint64_t monotonic,
1917 direction_t direction,
1918 Object **ret,
1919 uint64_t *offset) {
1920
de190aef
LP
1921 Object *o;
1922 int r;
1923
cbdca852 1924 assert(f);
de190aef 1925
47838ab3 1926 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
1927 if (r < 0)
1928 return r;
cbdca852 1929 if (r == 0)
de190aef
LP
1930 return -ENOENT;
1931
1932 return generic_array_bisect_plus_one(f,
1933 le64toh(o->data.entry_offset),
1934 le64toh(o->data.entry_array_offset),
1935 le64toh(o->data.n_entries),
1936 monotonic,
1937 test_object_monotonic,
1938 direction,
1939 ret, offset, NULL);
1940}
1941
de190aef
LP
1942int journal_file_next_entry(
1943 JournalFile *f,
1944 Object *o, uint64_t p,
1945 direction_t direction,
1946 Object **ret, uint64_t *offset) {
1947
1948 uint64_t i, n;
cec736d2
LP
1949 int r;
1950
1951 assert(f);
de190aef
LP
1952 assert(p > 0 || !o);
1953
1954 n = le64toh(f->header->n_entries);
1955 if (n <= 0)
1956 return 0;
cec736d2
LP
1957
1958 if (!o)
de190aef 1959 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 1960 else {
de190aef 1961 if (o->object.type != OBJECT_ENTRY)
cec736d2
LP
1962 return -EINVAL;
1963
de190aef
LP
1964 r = generic_array_bisect(f,
1965 le64toh(f->header->entry_array_offset),
1966 le64toh(f->header->n_entries),
1967 p,
1968 test_object_offset,
1969 DIRECTION_DOWN,
1970 NULL, NULL,
1971 &i);
1972 if (r <= 0)
1973 return r;
1974
1975 if (direction == DIRECTION_DOWN) {
1976 if (i >= n - 1)
1977 return 0;
1978
1979 i++;
1980 } else {
1981 if (i <= 0)
1982 return 0;
1983
1984 i--;
1985 }
cec736d2
LP
1986 }
1987
de190aef
LP
1988 /* And jump to it */
1989 return generic_array_get(f,
1990 le64toh(f->header->entry_array_offset),
1991 i,
1992 ret, offset);
1993}
cec736d2 1994
de190aef
LP
1995int journal_file_skip_entry(
1996 JournalFile *f,
1997 Object *o, uint64_t p,
1998 int64_t skip,
1999 Object **ret, uint64_t *offset) {
2000
2001 uint64_t i, n;
2002 int r;
2003
2004 assert(f);
2005 assert(o);
2006 assert(p > 0);
2007
2008 if (o->object.type != OBJECT_ENTRY)
2009 return -EINVAL;
2010
2011 r = generic_array_bisect(f,
2012 le64toh(f->header->entry_array_offset),
2013 le64toh(f->header->n_entries),
2014 p,
2015 test_object_offset,
2016 DIRECTION_DOWN,
2017 NULL, NULL,
2018 &i);
2019 if (r <= 0)
cec736d2
LP
2020 return r;
2021
de190aef
LP
2022 /* Calculate new index */
2023 if (skip < 0) {
2024 if ((uint64_t) -skip >= i)
2025 i = 0;
2026 else
2027 i = i - (uint64_t) -skip;
2028 } else
2029 i += (uint64_t) skip;
cec736d2 2030
de190aef
LP
2031 n = le64toh(f->header->n_entries);
2032 if (n <= 0)
2033 return -EBADMSG;
cec736d2 2034
de190aef
LP
2035 if (i >= n)
2036 i = n-1;
2037
2038 return generic_array_get(f,
2039 le64toh(f->header->entry_array_offset),
2040 i,
2041 ret, offset);
cec736d2
LP
2042}
2043
de190aef
LP
2044int journal_file_next_entry_for_data(
2045 JournalFile *f,
2046 Object *o, uint64_t p,
2047 uint64_t data_offset,
2048 direction_t direction,
2049 Object **ret, uint64_t *offset) {
2050
2051 uint64_t n, i;
cec736d2 2052 int r;
de190aef 2053 Object *d;
cec736d2
LP
2054
2055 assert(f);
de190aef 2056 assert(p > 0 || !o);
cec736d2 2057
de190aef 2058 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2059 if (r < 0)
de190aef 2060 return r;
cec736d2 2061
de190aef
LP
2062 n = le64toh(d->data.n_entries);
2063 if (n <= 0)
2064 return n;
cec736d2 2065
de190aef
LP
2066 if (!o)
2067 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2068 else {
2069 if (o->object.type != OBJECT_ENTRY)
2070 return -EINVAL;
cec736d2 2071
de190aef
LP
2072 r = generic_array_bisect_plus_one(f,
2073 le64toh(d->data.entry_offset),
2074 le64toh(d->data.entry_array_offset),
2075 le64toh(d->data.n_entries),
2076 p,
2077 test_object_offset,
2078 DIRECTION_DOWN,
2079 NULL, NULL,
2080 &i);
2081
2082 if (r <= 0)
cec736d2
LP
2083 return r;
2084
de190aef
LP
2085 if (direction == DIRECTION_DOWN) {
2086 if (i >= n - 1)
2087 return 0;
cec736d2 2088
de190aef
LP
2089 i++;
2090 } else {
2091 if (i <= 0)
2092 return 0;
cec736d2 2093
de190aef
LP
2094 i--;
2095 }
cec736d2 2096
de190aef 2097 }
cec736d2 2098
de190aef
LP
2099 return generic_array_get_plus_one(f,
2100 le64toh(d->data.entry_offset),
2101 le64toh(d->data.entry_array_offset),
2102 i,
2103 ret, offset);
2104}
cec736d2 2105
cbdca852
LP
2106int journal_file_move_to_entry_by_offset_for_data(
2107 JournalFile *f,
2108 uint64_t data_offset,
2109 uint64_t p,
2110 direction_t direction,
2111 Object **ret, uint64_t *offset) {
2112
2113 int r;
2114 Object *d;
2115
2116 assert(f);
2117
2118 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2119 if (r < 0)
2120 return r;
2121
2122 return generic_array_bisect_plus_one(f,
2123 le64toh(d->data.entry_offset),
2124 le64toh(d->data.entry_array_offset),
2125 le64toh(d->data.n_entries),
2126 p,
2127 test_object_offset,
2128 direction,
2129 ret, offset, NULL);
2130}
2131
2132int journal_file_move_to_entry_by_monotonic_for_data(
2133 JournalFile *f,
2134 uint64_t data_offset,
2135 sd_id128_t boot_id,
2136 uint64_t monotonic,
2137 direction_t direction,
2138 Object **ret, uint64_t *offset) {
2139
cbdca852
LP
2140 Object *o, *d;
2141 int r;
2142 uint64_t b, z;
2143
2144 assert(f);
2145
2146 /* First, seek by time */
47838ab3 2147 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2148 if (r < 0)
2149 return r;
2150 if (r == 0)
2151 return -ENOENT;
2152
2153 r = generic_array_bisect_plus_one(f,
2154 le64toh(o->data.entry_offset),
2155 le64toh(o->data.entry_array_offset),
2156 le64toh(o->data.n_entries),
2157 monotonic,
2158 test_object_monotonic,
2159 direction,
2160 NULL, &z, NULL);
2161 if (r <= 0)
2162 return r;
2163
2164 /* And now, continue seeking until we find an entry that
2165 * exists in both bisection arrays */
2166
2167 for (;;) {
2168 Object *qo;
2169 uint64_t p, q;
2170
2171 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2172 if (r < 0)
2173 return r;
2174
2175 r = generic_array_bisect_plus_one(f,
2176 le64toh(d->data.entry_offset),
2177 le64toh(d->data.entry_array_offset),
2178 le64toh(d->data.n_entries),
2179 z,
2180 test_object_offset,
2181 direction,
2182 NULL, &p, NULL);
2183 if (r <= 0)
2184 return r;
2185
2186 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2187 if (r < 0)
2188 return r;
2189
2190 r = generic_array_bisect_plus_one(f,
2191 le64toh(o->data.entry_offset),
2192 le64toh(o->data.entry_array_offset),
2193 le64toh(o->data.n_entries),
2194 p,
2195 test_object_offset,
2196 direction,
2197 &qo, &q, NULL);
2198
2199 if (r <= 0)
2200 return r;
2201
2202 if (p == q) {
2203 if (ret)
2204 *ret = qo;
2205 if (offset)
2206 *offset = q;
2207
2208 return 1;
2209 }
2210
2211 z = q;
2212 }
cbdca852
LP
2213}
2214
de190aef
LP
2215int journal_file_move_to_entry_by_seqnum_for_data(
2216 JournalFile *f,
2217 uint64_t data_offset,
2218 uint64_t seqnum,
2219 direction_t direction,
2220 Object **ret, uint64_t *offset) {
cec736d2 2221
de190aef
LP
2222 Object *d;
2223 int r;
cec736d2 2224
91a31dde
LP
2225 assert(f);
2226
de190aef 2227 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2228 if (r < 0)
de190aef 2229 return r;
cec736d2 2230
de190aef
LP
2231 return generic_array_bisect_plus_one(f,
2232 le64toh(d->data.entry_offset),
2233 le64toh(d->data.entry_array_offset),
2234 le64toh(d->data.n_entries),
2235 seqnum,
2236 test_object_seqnum,
2237 direction,
2238 ret, offset, NULL);
2239}
cec736d2 2240
de190aef
LP
2241int journal_file_move_to_entry_by_realtime_for_data(
2242 JournalFile *f,
2243 uint64_t data_offset,
2244 uint64_t realtime,
2245 direction_t direction,
2246 Object **ret, uint64_t *offset) {
2247
2248 Object *d;
2249 int r;
2250
91a31dde
LP
2251 assert(f);
2252
de190aef 2253 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2254 if (r < 0)
de190aef
LP
2255 return r;
2256
2257 return generic_array_bisect_plus_one(f,
2258 le64toh(d->data.entry_offset),
2259 le64toh(d->data.entry_array_offset),
2260 le64toh(d->data.n_entries),
2261 realtime,
2262 test_object_realtime,
2263 direction,
2264 ret, offset, NULL);
cec736d2
LP
2265}
2266
0284adc6 2267void journal_file_dump(JournalFile *f) {
7560fffc 2268 Object *o;
7560fffc 2269 int r;
0284adc6 2270 uint64_t p;
7560fffc
LP
2271
2272 assert(f);
2273
0284adc6 2274 journal_file_print_header(f);
7560fffc 2275
0284adc6
LP
2276 p = le64toh(f->header->header_size);
2277 while (p != 0) {
2278 r = journal_file_move_to_object(f, -1, p, &o);
2279 if (r < 0)
2280 goto fail;
7560fffc 2281
0284adc6 2282 switch (o->object.type) {
d98cc1f2 2283
0284adc6
LP
2284 case OBJECT_UNUSED:
2285 printf("Type: OBJECT_UNUSED\n");
2286 break;
d98cc1f2 2287
0284adc6
LP
2288 case OBJECT_DATA:
2289 printf("Type: OBJECT_DATA\n");
2290 break;
7560fffc 2291
3c1668da
LP
2292 case OBJECT_FIELD:
2293 printf("Type: OBJECT_FIELD\n");
2294 break;
2295
0284adc6 2296 case OBJECT_ENTRY:
507f22bd
ZJS
2297 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2298 le64toh(o->entry.seqnum),
2299 le64toh(o->entry.monotonic),
2300 le64toh(o->entry.realtime));
0284adc6 2301 break;
7560fffc 2302
0284adc6
LP
2303 case OBJECT_FIELD_HASH_TABLE:
2304 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2305 break;
7560fffc 2306
0284adc6
LP
2307 case OBJECT_DATA_HASH_TABLE:
2308 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2309 break;
7560fffc 2310
0284adc6
LP
2311 case OBJECT_ENTRY_ARRAY:
2312 printf("Type: OBJECT_ENTRY_ARRAY\n");
2313 break;
7560fffc 2314
0284adc6 2315 case OBJECT_TAG:
507f22bd
ZJS
2316 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2317 le64toh(o->tag.seqnum),
2318 le64toh(o->tag.epoch));
0284adc6 2319 break;
3c1668da
LP
2320
2321 default:
2322 printf("Type: unknown (%u)\n", o->object.type);
2323 break;
0284adc6 2324 }
7560fffc 2325
0284adc6
LP
2326 if (o->object.flags & OBJECT_COMPRESSED)
2327 printf("Flags: COMPRESSED\n");
7560fffc 2328
0284adc6
LP
2329 if (p == le64toh(f->header->tail_object_offset))
2330 p = 0;
2331 else
2332 p = p + ALIGN64(le64toh(o->object.size));
2333 }
7560fffc 2334
0284adc6
LP
2335 return;
2336fail:
2337 log_error("File corrupt");
7560fffc
LP
2338}
2339
718fe4b1
ZJS
2340static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2341 const char *x;
2342
2343 x = format_timestamp(buf, l, t);
2344 if (x)
2345 return x;
2346 return " --- ";
2347}
2348
0284adc6 2349void journal_file_print_header(JournalFile *f) {
2765b7bb 2350 char a[33], b[33], c[33], d[33];
ed375beb 2351 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2352 struct stat st;
2353 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2354
2355 assert(f);
7560fffc 2356
0284adc6
LP
2357 printf("File Path: %s\n"
2358 "File ID: %s\n"
2359 "Machine ID: %s\n"
2360 "Boot ID: %s\n"
2361 "Sequential Number ID: %s\n"
2362 "State: %s\n"
2363 "Compatible Flags:%s%s\n"
2364 "Incompatible Flags:%s%s\n"
507f22bd
ZJS
2365 "Header size: %"PRIu64"\n"
2366 "Arena size: %"PRIu64"\n"
2367 "Data Hash Table Size: %"PRIu64"\n"
2368 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2369 "Rotate Suggested: %s\n"
507f22bd
ZJS
2370 "Head Sequential Number: %"PRIu64"\n"
2371 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2372 "Head Realtime Timestamp: %s\n"
3223f44f 2373 "Tail Realtime Timestamp: %s\n"
ed375beb 2374 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2375 "Objects: %"PRIu64"\n"
2376 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2377 f->path,
2378 sd_id128_to_string(f->header->file_id, a),
2379 sd_id128_to_string(f->header->machine_id, b),
2380 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2381 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2382 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2383 f->header->state == STATE_ONLINE ? "ONLINE" :
2384 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3
LP
2385 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2386 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2387 JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2388 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
507f22bd
ZJS
2389 le64toh(f->header->header_size),
2390 le64toh(f->header->arena_size),
2391 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2392 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2393 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2394 le64toh(f->header->head_entry_seqnum),
2395 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2396 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2397 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2398 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2399 le64toh(f->header->n_objects),
2400 le64toh(f->header->n_entries));
7560fffc 2401
0284adc6 2402 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2403 printf("Data Objects: %"PRIu64"\n"
0284adc6 2404 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2405 le64toh(f->header->n_data),
0284adc6 2406 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2407
0284adc6 2408 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2409 printf("Field Objects: %"PRIu64"\n"
0284adc6 2410 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2411 le64toh(f->header->n_fields),
0284adc6 2412 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2413
2414 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2415 printf("Tag Objects: %"PRIu64"\n",
2416 le64toh(f->header->n_tags));
3223f44f 2417 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2418 printf("Entry Array Objects: %"PRIu64"\n",
2419 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2420
2421 if (fstat(f->fd, &st) >= 0)
2422 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
7560fffc
LP
2423}
2424
0284adc6
LP
2425int journal_file_open(
2426 const char *fname,
2427 int flags,
2428 mode_t mode,
2429 bool compress,
baed47c3 2430 bool seal,
0284adc6
LP
2431 JournalMetrics *metrics,
2432 MMapCache *mmap_cache,
2433 JournalFile *template,
2434 JournalFile **ret) {
7560fffc 2435
0284adc6
LP
2436 JournalFile *f;
2437 int r;
2438 bool newly_created = false;
7560fffc 2439
0284adc6 2440 assert(fname);
0559d3a5 2441 assert(ret);
7560fffc 2442
0284adc6
LP
2443 if ((flags & O_ACCMODE) != O_RDONLY &&
2444 (flags & O_ACCMODE) != O_RDWR)
2445 return -EINVAL;
7560fffc 2446
a0108012
LP
2447 if (!endswith(fname, ".journal") &&
2448 !endswith(fname, ".journal~"))
0284adc6 2449 return -EINVAL;
7560fffc 2450
0284adc6
LP
2451 f = new0(JournalFile, 1);
2452 if (!f)
2453 return -ENOMEM;
7560fffc 2454
0284adc6
LP
2455 f->fd = -1;
2456 f->mode = mode;
7560fffc 2457
0284adc6
LP
2458 f->flags = flags;
2459 f->prot = prot_from_flags(flags);
2460 f->writable = (flags & O_ACCMODE) != O_RDONLY;
48b61739 2461#ifdef HAVE_XZ
0284adc6 2462 f->compress = compress;
48b61739 2463#endif
49a32d43 2464#ifdef HAVE_GCRYPT
baed47c3 2465 f->seal = seal;
49a32d43 2466#endif
7560fffc 2467
0284adc6
LP
2468 if (mmap_cache)
2469 f->mmap = mmap_cache_ref(mmap_cache);
2470 else {
84168d80 2471 f->mmap = mmap_cache_new();
0284adc6
LP
2472 if (!f->mmap) {
2473 r = -ENOMEM;
2474 goto fail;
2475 }
2476 }
7560fffc 2477
0284adc6
LP
2478 f->path = strdup(fname);
2479 if (!f->path) {
2480 r = -ENOMEM;
2481 goto fail;
2482 }
7560fffc 2483
a4bcff5b
LP
2484 f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2485 if (!f->chain_cache) {
2486 r = -ENOMEM;
2487 goto fail;
2488 }
2489
0284adc6
LP
2490 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2491 if (f->fd < 0) {
2492 r = -errno;
2493 goto fail;
7560fffc 2494 }
7560fffc 2495
0284adc6
LP
2496 if (fstat(f->fd, &f->last_stat) < 0) {
2497 r = -errno;
2498 goto fail;
2499 }
7560fffc 2500
0284adc6 2501 if (f->last_stat.st_size == 0 && f->writable) {
fb0951b0
LP
2502#ifdef HAVE_XATTR
2503 uint64_t crtime;
2504
2505 /* Let's attach the creation time to the journal file,
2506 * so that the vacuuming code knows the age of this
2507 * file even if the file might end up corrupted one
2508 * day... Ideally we'd just use the creation time many
2509 * file systems maintain for each file, but there is
2510 * currently no usable API to query this, hence let's
2511 * emulate this via extended attributes. If extended
2512 * attributes are not supported we'll just skip this,
2513 * and rely solely on mtime/atime/ctime of the file.*/
2514
2515 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2516 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
2517#endif
7560fffc 2518
feb12d3e 2519#ifdef HAVE_GCRYPT
0284adc6 2520 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2521 * just don't do sealing */
49a32d43
LP
2522 if (f->seal) {
2523 r = journal_file_fss_load(f);
2524 if (r < 0)
2525 f->seal = false;
2526 }
feb12d3e 2527#endif
7560fffc 2528
0284adc6
LP
2529 r = journal_file_init_header(f, template);
2530 if (r < 0)
2531 goto fail;
7560fffc 2532
0284adc6
LP
2533 if (fstat(f->fd, &f->last_stat) < 0) {
2534 r = -errno;
2535 goto fail;
2536 }
fb0951b0
LP
2537
2538 newly_created = true;
0284adc6 2539 }
7560fffc 2540
0284adc6
LP
2541 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2542 r = -EIO;
2543 goto fail;
2544 }
7560fffc 2545
0284adc6
LP
2546 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2547 if (f->header == MAP_FAILED) {
2548 f->header = NULL;
2549 r = -errno;
2550 goto fail;
2551 }
7560fffc 2552
0284adc6
LP
2553 if (!newly_created) {
2554 r = journal_file_verify_header(f);
2555 if (r < 0)
2556 goto fail;
2557 }
7560fffc 2558
feb12d3e 2559#ifdef HAVE_GCRYPT
0284adc6 2560 if (!newly_created && f->writable) {
baed47c3 2561 r = journal_file_fss_load(f);
0284adc6
LP
2562 if (r < 0)
2563 goto fail;
2564 }
feb12d3e 2565#endif
cec736d2
LP
2566
2567 if (f->writable) {
4a92baf3
LP
2568 if (metrics) {
2569 journal_default_metrics(metrics, f->fd);
2570 f->metrics = *metrics;
2571 } else if (template)
2572 f->metrics = template->metrics;
2573
cec736d2
LP
2574 r = journal_file_refresh_header(f);
2575 if (r < 0)
2576 goto fail;
2577 }
2578
feb12d3e 2579#ifdef HAVE_GCRYPT
baed47c3 2580 r = journal_file_hmac_setup(f);
14d10188
LP
2581 if (r < 0)
2582 goto fail;
feb12d3e 2583#endif
14d10188 2584
cec736d2 2585 if (newly_created) {
de190aef 2586 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2587 if (r < 0)
2588 goto fail;
2589
de190aef 2590 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2591 if (r < 0)
2592 goto fail;
7560fffc 2593
feb12d3e 2594#ifdef HAVE_GCRYPT
7560fffc
LP
2595 r = journal_file_append_first_tag(f);
2596 if (r < 0)
2597 goto fail;
feb12d3e 2598#endif
cec736d2
LP
2599 }
2600
de190aef 2601 r = journal_file_map_field_hash_table(f);
cec736d2
LP
2602 if (r < 0)
2603 goto fail;
2604
de190aef 2605 r = journal_file_map_data_hash_table(f);
cec736d2
LP
2606 if (r < 0)
2607 goto fail;
2608
0559d3a5 2609 *ret = f;
cec736d2
LP
2610 return 0;
2611
2612fail:
2613 journal_file_close(f);
2614
2615 return r;
2616}
0ac38b70 2617
baed47c3 2618int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2619 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2620 size_t l;
2621 JournalFile *old_file, *new_file = NULL;
2622 int r;
2623
2624 assert(f);
2625 assert(*f);
2626
2627 old_file = *f;
2628
2629 if (!old_file->writable)
2630 return -EINVAL;
2631
2632 if (!endswith(old_file->path, ".journal"))
2633 return -EINVAL;
2634
2635 l = strlen(old_file->path);
57535f47
ZJS
2636 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2637 (int) l - 8, old_file->path,
2638 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2639 le64toh((*f)->header->head_entry_seqnum),
2640 le64toh((*f)->header->head_entry_realtime));
2641 if (r < 0)
0ac38b70
LP
2642 return -ENOMEM;
2643
0ac38b70 2644 r = rename(old_file->path, p);
0ac38b70
LP
2645 if (r < 0)
2646 return -errno;
2647
ccdbaf91 2648 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2649
baed47c3 2650 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2651 journal_file_close(old_file);
2652
2653 *f = new_file;
2654 return r;
2655}
2656
9447a7f1
LP
2657int journal_file_open_reliably(
2658 const char *fname,
2659 int flags,
2660 mode_t mode,
7560fffc 2661 bool compress,
baed47c3 2662 bool seal,
4a92baf3 2663 JournalMetrics *metrics,
27370278 2664 MMapCache *mmap_cache,
9447a7f1
LP
2665 JournalFile *template,
2666 JournalFile **ret) {
2667
2668 int r;
2669 size_t l;
ed375beb 2670 _cleanup_free_ char *p = NULL;
9447a7f1 2671
baed47c3 2672 r = journal_file_open(fname, flags, mode, compress, seal,
27370278 2673 metrics, mmap_cache, template, ret);
0071d9f1
LP
2674 if (r != -EBADMSG && /* corrupted */
2675 r != -ENODATA && /* truncated */
2676 r != -EHOSTDOWN && /* other machine */
a1a1898f
LP
2677 r != -EPROTONOSUPPORT && /* incompatible feature */
2678 r != -EBUSY && /* unclean shutdown */
2679 r != -ESHUTDOWN /* already archived */)
9447a7f1
LP
2680 return r;
2681
2682 if ((flags & O_ACCMODE) == O_RDONLY)
2683 return r;
2684
2685 if (!(flags & O_CREAT))
2686 return r;
2687
7560fffc
LP
2688 if (!endswith(fname, ".journal"))
2689 return r;
2690
5c70eab4
LP
2691 /* The file is corrupted. Rotate it away and try it again (but only once) */
2692
9447a7f1 2693 l = strlen(fname);
9bf3b535 2694 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
57535f47 2695 (int) l - 8, fname,
9447a7f1 2696 (unsigned long long) now(CLOCK_REALTIME),
9bf3b535 2697 random_u64()) < 0)
9447a7f1
LP
2698 return -ENOMEM;
2699
2700 r = rename(fname, p);
9447a7f1
LP
2701 if (r < 0)
2702 return -errno;
2703
a1a1898f 2704 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2705
baed47c3 2706 return journal_file_open(fname, flags, mode, compress, seal,
27370278 2707 metrics, mmap_cache, template, ret);
9447a7f1
LP
2708}
2709
cf244689
LP
2710int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2711 uint64_t i, n;
2712 uint64_t q, xor_hash = 0;
2713 int r;
2714 EntryItem *items;
2715 dual_timestamp ts;
2716
2717 assert(from);
2718 assert(to);
2719 assert(o);
2720 assert(p);
2721
2722 if (!to->writable)
2723 return -EPERM;
2724
2725 ts.monotonic = le64toh(o->entry.monotonic);
2726 ts.realtime = le64toh(o->entry.realtime);
2727
cf244689 2728 n = journal_file_entry_n_items(o);
4faa7004
TA
2729 /* alloca() can't take 0, hence let's allocate at least one */
2730 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
2731
2732 for (i = 0; i < n; i++) {
4fd052ae
FC
2733 uint64_t l, h;
2734 le64_t le_hash;
cf244689
LP
2735 size_t t;
2736 void *data;
2737 Object *u;
2738
2739 q = le64toh(o->entry.items[i].object_offset);
2740 le_hash = o->entry.items[i].hash;
2741
2742 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2743 if (r < 0)
2744 return r;
2745
2746 if (le_hash != o->data.hash)
2747 return -EBADMSG;
2748
2749 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2750 t = (size_t) l;
2751
2752 /* We hit the limit on 32bit machines */
2753 if ((uint64_t) t != l)
2754 return -E2BIG;
2755
2756 if (o->object.flags & OBJECT_COMPRESSED) {
2757#ifdef HAVE_XZ
2758 uint64_t rsize;
2759
93b73b06 2760 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
cf244689
LP
2761 return -EBADMSG;
2762
2763 data = from->compress_buffer;
2764 l = rsize;
2765#else
2766 return -EPROTONOSUPPORT;
2767#endif
2768 } else
2769 data = o->data.payload;
2770
2771 r = journal_file_append_data(to, data, l, &u, &h);
2772 if (r < 0)
2773 return r;
2774
2775 xor_hash ^= le64toh(u->data.hash);
2776 items[i].object_offset = htole64(h);
2777 items[i].hash = u->data.hash;
2778
2779 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2780 if (r < 0)
2781 return r;
2782 }
2783
2784 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2785}
babfc091
LP
2786
2787void journal_default_metrics(JournalMetrics *m, int fd) {
2788 uint64_t fs_size = 0;
2789 struct statvfs ss;
a7bc2c2a 2790 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
babfc091
LP
2791
2792 assert(m);
2793 assert(fd >= 0);
2794
2795 if (fstatvfs(fd, &ss) >= 0)
2796 fs_size = ss.f_frsize * ss.f_blocks;
2797
2798 if (m->max_use == (uint64_t) -1) {
2799
2800 if (fs_size > 0) {
2801 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2802
2803 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2804 m->max_use = DEFAULT_MAX_USE_UPPER;
2805
2806 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2807 m->max_use = DEFAULT_MAX_USE_LOWER;
2808 } else
2809 m->max_use = DEFAULT_MAX_USE_LOWER;
2810 } else {
2811 m->max_use = PAGE_ALIGN(m->max_use);
2812
2813 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2814 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2815 }
2816
2817 if (m->max_size == (uint64_t) -1) {
2818 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2819
2820 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2821 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2822 } else
2823 m->max_size = PAGE_ALIGN(m->max_size);
2824
2825 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2826 m->max_size = JOURNAL_FILE_SIZE_MIN;
2827
2828 if (m->max_size*2 > m->max_use)
2829 m->max_use = m->max_size*2;
2830
2831 if (m->min_size == (uint64_t) -1)
2832 m->min_size = JOURNAL_FILE_SIZE_MIN;
2833 else {
2834 m->min_size = PAGE_ALIGN(m->min_size);
2835
2836 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2837 m->min_size = JOURNAL_FILE_SIZE_MIN;
2838
2839 if (m->min_size > m->max_size)
2840 m->max_size = m->min_size;
2841 }
2842
2843 if (m->keep_free == (uint64_t) -1) {
2844
2845 if (fs_size > 0) {
8621b110 2846 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
2847
2848 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2849 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2850
2851 } else
2852 m->keep_free = DEFAULT_KEEP_FREE;
2853 }
2854
2b43f939
LP
2855 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2856 format_bytes(a, sizeof(a), m->max_use),
2857 format_bytes(b, sizeof(b), m->max_size),
2858 format_bytes(c, sizeof(c), m->min_size),
2859 format_bytes(d, sizeof(d), m->keep_free));
babfc091 2860}
08984293
LP
2861
2862int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
2863 assert(f);
2864 assert(from || to);
2865
2866 if (from) {
162566a4
LP
2867 if (f->header->head_entry_realtime == 0)
2868 return -ENOENT;
08984293 2869
162566a4 2870 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
2871 }
2872
2873 if (to) {
162566a4
LP
2874 if (f->header->tail_entry_realtime == 0)
2875 return -ENOENT;
08984293 2876
162566a4 2877 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
2878 }
2879
2880 return 1;
2881}
2882
2883int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
2884 Object *o;
2885 uint64_t p;
2886 int r;
2887
2888 assert(f);
2889 assert(from || to);
2890
47838ab3 2891 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
2892 if (r <= 0)
2893 return r;
2894
2895 if (le64toh(o->data.n_entries) <= 0)
2896 return 0;
2897
2898 if (from) {
2899 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2900 if (r < 0)
2901 return r;
2902
2903 *from = le64toh(o->entry.monotonic);
2904 }
2905
2906 if (to) {
2907 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2908 if (r < 0)
2909 return r;
2910
2911 r = generic_array_get_plus_one(f,
2912 le64toh(o->data.entry_offset),
2913 le64toh(o->data.entry_array_offset),
2914 le64toh(o->data.n_entries)-1,
2915 &o, NULL);
2916 if (r <= 0)
2917 return r;
2918
2919 *to = le64toh(o->entry.monotonic);
2920 }
2921
2922 return 1;
2923}
dca6219e 2924
fb0951b0 2925bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
2926 assert(f);
2927
2928 /* If we gained new header fields we gained new features,
2929 * hence suggest a rotation */
361f9cbc
LP
2930 if (le64toh(f->header->header_size) < sizeof(Header)) {
2931 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 2932 return true;
361f9cbc 2933 }
dca6219e
LP
2934
2935 /* Let's check if the hash tables grew over a certain fill
2936 * level (75%, borrowing this value from Java's hash table
2937 * implementation), and if so suggest a rotation. To calculate
2938 * the fill level we need the n_data field, which only exists
2939 * in newer versions. */
2940
2941 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 2942 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 2943 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
2944 f->path,
2945 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
2946 le64toh(f->header->n_data),
2947 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2948 (unsigned long long) f->last_stat.st_size,
2949 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 2950 return true;
361f9cbc 2951 }
dca6219e
LP
2952
2953 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 2954 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 2955 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
2956 f->path,
2957 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
2958 le64toh(f->header->n_fields),
2959 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 2960 return true;
361f9cbc 2961 }
dca6219e 2962
0598fd4a
LP
2963 /* Are the data objects properly indexed by field objects? */
2964 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2965 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2966 le64toh(f->header->n_data) > 0 &&
2967 le64toh(f->header->n_fields) == 0)
2968 return true;
2969
fb0951b0
LP
2970 if (max_file_usec > 0) {
2971 usec_t t, h;
2972
2973 h = le64toh(f->header->head_entry_realtime);
2974 t = now(CLOCK_REALTIME);
2975
2976 if (h > 0 && t > h + max_file_usec)
2977 return true;
2978 }
2979
dca6219e
LP
2980 return false;
2981}