]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
journal/compress: return early in uncompress_startswith
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <sys/mman.h>
23#include <errno.h>
24#include <sys/uio.h>
25#include <unistd.h>
26#include <sys/statvfs.h>
27#include <fcntl.h>
28#include <stddef.h>
d2edfae0 29#include <sys/xattr.h>
fb0951b0 30
cec736d2
LP
31#include "journal-def.h"
32#include "journal-file.h"
0284adc6 33#include "journal-authenticate.h"
cec736d2 34#include "lookup3.h"
807e17f0 35#include "compress.h"
7560fffc 36#include "fsprg.h"
cec736d2 37
4a92baf3
LP
38#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
39#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 40
be19b7df 41#define COMPRESSION_SIZE_THRESHOLD (512ULL)
807e17f0 42
babfc091 43/* This is the minimum journal file size */
253f59df 44#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
babfc091
LP
45
46/* These are the lower and upper bounds if we deduce the max_use value
47 * from the file system size */
48#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
49#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
50
51/* This is the upper bound if we deduce max_size from max_use */
71100051 52#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
babfc091
LP
53
54/* This is the upper bound if we deduce the keep_free value from the
55 * file system size */
56#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
57
58/* This is the keep_free value when we can't determine the system
59 * size */
60#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
61
dca6219e
LP
62/* n_data was the first entry we added after the initial file format design */
63#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 64
a4bcff5b
LP
65/* How many entries to keep in the entry array chain cache at max */
66#define CHAIN_CACHE_MAX 20
67
a676e665
LP
68/* How much to increase the journal file size at once each time we allocate something new. */
69#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
70
9588bc32 71static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
72 assert(f);
73
74 if (!f->writable)
75 return -EPERM;
76
77 if (!(f->fd >= 0 && f->header))
78 return -EINVAL;
79
80 switch(f->header->state) {
81 case STATE_ONLINE:
82 return 0;
83
84 case STATE_OFFLINE:
85 f->header->state = STATE_ONLINE;
86 fsync(f->fd);
87 return 0;
88
89 default:
90 return -EINVAL;
91 }
92}
93
94int journal_file_set_offline(JournalFile *f) {
95 assert(f);
96
97 if (!f->writable)
98 return -EPERM;
99
100 if (!(f->fd >= 0 && f->header))
101 return -EINVAL;
102
103 if (f->header->state != STATE_ONLINE)
104 return 0;
105
106 fsync(f->fd);
107
108 f->header->state = STATE_OFFLINE;
109
110 fsync(f->fd);
111
112 return 0;
113}
114
cec736d2 115void journal_file_close(JournalFile *f) {
de190aef 116 assert(f);
cec736d2 117
feb12d3e 118#ifdef HAVE_GCRYPT
b0af6f41 119 /* Write the final tag */
c586dbf1 120 if (f->seal && f->writable)
b0af6f41 121 journal_file_append_tag(f);
feb12d3e 122#endif
b0af6f41 123
7560fffc 124 /* Sync everything to disk, before we mark the file offline */
16e9f408
LP
125 if (f->mmap && f->fd >= 0)
126 mmap_cache_close_fd(f->mmap, f->fd);
7560fffc 127
26687bf8 128 journal_file_set_offline(f);
cec736d2 129
26687bf8 130 if (f->header)
d384c7a8 131 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
cec736d2 132
03e334a1 133 safe_close(f->fd);
cec736d2 134 free(f->path);
807e17f0 135
16e9f408
LP
136 if (f->mmap)
137 mmap_cache_unref(f->mmap);
138
a4bcff5b
LP
139 hashmap_free_free(f->chain_cache);
140
807e17f0
LP
141#ifdef HAVE_XZ
142 free(f->compress_buffer);
143#endif
144
7560fffc 145#ifdef HAVE_GCRYPT
baed47c3
LP
146 if (f->fss_file)
147 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
b7c9ae91
LP
148 else if (f->fsprg_state)
149 free(f->fsprg_state);
150
151 free(f->fsprg_seed);
7560fffc
LP
152
153 if (f->hmac)
154 gcry_md_close(f->hmac);
155#endif
156
cec736d2
LP
157 free(f);
158}
159
0ac38b70 160static int journal_file_init_header(JournalFile *f, JournalFile *template) {
cec736d2
LP
161 Header h;
162 ssize_t k;
163 int r;
164
165 assert(f);
166
167 zero(h);
7560fffc 168 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 169 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 170
7560fffc
LP
171 h.incompatible_flags =
172 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
173
174 h.compatible_flags =
baed47c3 175 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
7560fffc 176
cec736d2
LP
177 r = sd_id128_randomize(&h.file_id);
178 if (r < 0)
179 return r;
180
0ac38b70
LP
181 if (template) {
182 h.seqnum_id = template->header->seqnum_id;
beec0085 183 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
184 } else
185 h.seqnum_id = h.file_id;
cec736d2
LP
186
187 k = pwrite(f->fd, &h, sizeof(h), 0);
188 if (k < 0)
189 return -errno;
190
191 if (k != sizeof(h))
192 return -EIO;
193
194 return 0;
195}
196
197static int journal_file_refresh_header(JournalFile *f) {
198 int r;
de190aef 199 sd_id128_t boot_id;
cec736d2
LP
200
201 assert(f);
202
203 r = sd_id128_get_machine(&f->header->machine_id);
204 if (r < 0)
205 return r;
206
de190aef 207 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
208 if (r < 0)
209 return r;
210
de190aef
LP
211 if (sd_id128_equal(boot_id, f->header->boot_id))
212 f->tail_entry_monotonic_valid = true;
213
214 f->header->boot_id = boot_id;
215
26687bf8 216 journal_file_set_online(f);
b788cc23 217
7560fffc 218 /* Sync the online state to disk */
a676e665 219 fsync(f->fd);
b788cc23 220
cec736d2
LP
221 return 0;
222}
223
224static int journal_file_verify_header(JournalFile *f) {
225 assert(f);
226
7560fffc 227 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
228 return -EBADMSG;
229
7560fffc
LP
230 /* In both read and write mode we refuse to open files with
231 * incompatible flags we don't know */
807e17f0 232#ifdef HAVE_XZ
7560fffc 233 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
807e17f0
LP
234 return -EPROTONOSUPPORT;
235#else
cec736d2
LP
236 if (f->header->incompatible_flags != 0)
237 return -EPROTONOSUPPORT;
807e17f0 238#endif
cec736d2 239
7560fffc
LP
240 /* When open for writing we refuse to open files with
241 * compatible flags, too */
242 if (f->writable) {
243#ifdef HAVE_GCRYPT
baed47c3 244 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
7560fffc
LP
245 return -EPROTONOSUPPORT;
246#else
247 if (f->header->compatible_flags != 0)
248 return -EPROTONOSUPPORT;
249#endif
250 }
251
db11ac1a
LP
252 if (f->header->state >= _STATE_MAX)
253 return -EBADMSG;
254
dca6219e
LP
255 /* The first addition was n_data, so check that we are at least this large */
256 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
257 return -EBADMSG;
258
8088cbd3 259 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
260 return -EBADMSG;
261
db11ac1a
LP
262 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
263 return -ENODATA;
264
265 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
266 return -ENODATA;
267
7762e02b
LP
268 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
269 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
270 !VALID64(le64toh(f->header->tail_object_offset)) ||
271 !VALID64(le64toh(f->header->entry_array_offset)))
272 return -ENODATA;
273
274 if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
275 le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
276 le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
277 le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
cec736d2
LP
278 return -ENODATA;
279
280 if (f->writable) {
ccdbaf91 281 uint8_t state;
cec736d2
LP
282 sd_id128_t machine_id;
283 int r;
284
285 r = sd_id128_get_machine(&machine_id);
286 if (r < 0)
287 return r;
288
289 if (!sd_id128_equal(machine_id, f->header->machine_id))
290 return -EHOSTDOWN;
291
de190aef 292 state = f->header->state;
cec736d2 293
71fa6f00
LP
294 if (state == STATE_ONLINE) {
295 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
296 return -EBUSY;
297 } else if (state == STATE_ARCHIVED)
cec736d2 298 return -ESHUTDOWN;
71fa6f00
LP
299 else if (state != STATE_OFFLINE) {
300 log_debug("Journal file %s has unknown state %u.", f->path, state);
301 return -EBUSY;
302 }
cec736d2
LP
303 }
304
8088cbd3 305 f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
c586dbf1 306
f1889c91 307 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 308
cec736d2
LP
309 return 0;
310}
311
312static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 313 uint64_t old_size, new_size;
fec2aa2f 314 int r;
cec736d2
LP
315
316 assert(f);
317
cec736d2 318 /* We assume that this file is not sparse, and we know that
38ac38b2 319 * for sure, since we always call posix_fallocate()
cec736d2
LP
320 * ourselves */
321
322 old_size =
23b0b2b2 323 le64toh(f->header->header_size) +
cec736d2
LP
324 le64toh(f->header->arena_size);
325
bc85bfee 326 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
327 if (new_size < le64toh(f->header->header_size))
328 new_size = le64toh(f->header->header_size);
bc85bfee
LP
329
330 if (new_size <= old_size)
cec736d2
LP
331 return 0;
332
a676e665 333 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 334 return -E2BIG;
cec736d2 335
a676e665 336 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
337 struct statvfs svfs;
338
339 if (fstatvfs(f->fd, &svfs) >= 0) {
340 uint64_t available;
341
342 available = svfs.f_bfree * svfs.f_bsize;
343
bc85bfee
LP
344 if (available >= f->metrics.keep_free)
345 available -= f->metrics.keep_free;
cec736d2
LP
346 else
347 available = 0;
348
349 if (new_size - old_size > available)
350 return -E2BIG;
351 }
352 }
353
eda4b58b
LP
354 /* Increase by larger blocks at once */
355 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
356 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
357 new_size = f->metrics.max_size;
358
bc85bfee
LP
359 /* Note that the glibc fallocate() fallback is very
360 inefficient, hence we try to minimize the allocation area
361 as we can. */
fec2aa2f
GV
362 r = posix_fallocate(f->fd, old_size, new_size - old_size);
363 if (r != 0)
364 return -r;
cec736d2 365
eda4b58b
LP
366 if (fstat(f->fd, &f->last_stat) < 0)
367 return -errno;
cec736d2 368
23b0b2b2 369 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2
LP
370
371 return 0;
372}
373
fcde2389 374static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
cec736d2 375 assert(f);
cec736d2
LP
376 assert(ret);
377
7762e02b
LP
378 if (size <= 0)
379 return -EINVAL;
380
2a59ea54 381 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
382 if (offset + size > (uint64_t) f->last_stat.st_size) {
383 /* Hmm, out of range? Let's refresh the fstat() data
384 * first, before we trust that check. */
385
386 if (fstat(f->fd, &f->last_stat) < 0 ||
387 offset + size > (uint64_t) f->last_stat.st_size)
388 return -EADDRNOTAVAIL;
389 }
390
fcde2389 391 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
392}
393
16e9f408
LP
394static uint64_t minimum_header_size(Object *o) {
395
b8e891e6 396 static const uint64_t table[] = {
16e9f408
LP
397 [OBJECT_DATA] = sizeof(DataObject),
398 [OBJECT_FIELD] = sizeof(FieldObject),
399 [OBJECT_ENTRY] = sizeof(EntryObject),
400 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
401 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
402 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
403 [OBJECT_TAG] = sizeof(TagObject),
404 };
405
406 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
407 return sizeof(ObjectHeader);
408
409 return table[o->object.type];
410}
411
de190aef 412int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
cec736d2
LP
413 int r;
414 void *t;
415 Object *o;
416 uint64_t s;
417
418 assert(f);
419 assert(ret);
420
db11ac1a
LP
421 /* Objects may only be located at multiple of 64 bit */
422 if (!VALID64(offset))
423 return -EFAULT;
424
16e9f408 425
ae97089d 426 r = journal_file_move_to(f, type_to_context(type), false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
427 if (r < 0)
428 return r;
429
430 o = (Object*) t;
431 s = le64toh(o->object.size);
432
433 if (s < sizeof(ObjectHeader))
434 return -EBADMSG;
435
16e9f408
LP
436 if (o->object.type <= OBJECT_UNUSED)
437 return -EBADMSG;
438
439 if (s < minimum_header_size(o))
440 return -EBADMSG;
441
3c1668da 442 if (type > 0 && o->object.type != type)
cec736d2
LP
443 return -EBADMSG;
444
445 if (s > sizeof(ObjectHeader)) {
fcde2389 446 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
cec736d2
LP
447 if (r < 0)
448 return r;
449
450 o = (Object*) t;
451 }
452
cec736d2
LP
453 *ret = o;
454 return 0;
455}
456
d98cc1f2 457static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
458 uint64_t r;
459
460 assert(f);
461
beec0085 462 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
463
464 if (seqnum) {
de190aef 465 /* If an external seqnum counter was passed, we update
c2373f84
LP
466 * both the local and the external one, and set it to
467 * the maximum of both */
468
469 if (*seqnum + 1 > r)
470 r = *seqnum + 1;
471
472 *seqnum = r;
473 }
474
beec0085 475 f->header->tail_entry_seqnum = htole64(r);
cec736d2 476
beec0085
LP
477 if (f->header->head_entry_seqnum == 0)
478 f->header->head_entry_seqnum = htole64(r);
de190aef 479
cec736d2
LP
480 return r;
481}
482
0284adc6 483int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
484 int r;
485 uint64_t p;
486 Object *tail, *o;
487 void *t;
488
489 assert(f);
16e9f408 490 assert(type > 0 && type < _OBJECT_TYPE_MAX);
cec736d2
LP
491 assert(size >= sizeof(ObjectHeader));
492 assert(offset);
493 assert(ret);
494
26687bf8
OS
495 r = journal_file_set_online(f);
496 if (r < 0)
497 return r;
498
cec736d2 499 p = le64toh(f->header->tail_object_offset);
cec736d2 500 if (p == 0)
23b0b2b2 501 p = le64toh(f->header->header_size);
cec736d2 502 else {
de190aef 503 r = journal_file_move_to_object(f, -1, p, &tail);
cec736d2
LP
504 if (r < 0)
505 return r;
506
507 p += ALIGN64(le64toh(tail->object.size));
508 }
509
510 r = journal_file_allocate(f, p, size);
511 if (r < 0)
512 return r;
513
fcde2389 514 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
515 if (r < 0)
516 return r;
517
518 o = (Object*) t;
519
520 zero(o->object);
de190aef 521 o->object.type = type;
cec736d2
LP
522 o->object.size = htole64(size);
523
524 f->header->tail_object_offset = htole64(p);
cec736d2
LP
525 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
526
527 *ret = o;
528 *offset = p;
529
530 return 0;
531}
532
de190aef 533static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
534 uint64_t s, p;
535 Object *o;
536 int r;
537
538 assert(f);
539
dfabe643 540 /* We estimate that we need 1 hash table entry per 768 of
4a92baf3
LP
541 journal file and we want to make sure we never get beyond
542 75% fill level. Calculate the hash table size for the
543 maximum file size based on these metrics. */
544
dfabe643 545 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
546 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
547 s = DEFAULT_DATA_HASH_TABLE_SIZE;
548
507f22bd 549 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 550
de190aef
LP
551 r = journal_file_append_object(f,
552 OBJECT_DATA_HASH_TABLE,
553 offsetof(Object, hash_table.items) + s,
554 &o, &p);
cec736d2
LP
555 if (r < 0)
556 return r;
557
29804cc1 558 memzero(o->hash_table.items, s);
cec736d2 559
de190aef
LP
560 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
561 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
562
563 return 0;
564}
565
de190aef 566static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
567 uint64_t s, p;
568 Object *o;
569 int r;
570
571 assert(f);
572
3c1668da
LP
573 /* We use a fixed size hash table for the fields as this
574 * number should grow very slowly only */
575
de190aef
LP
576 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
577 r = journal_file_append_object(f,
578 OBJECT_FIELD_HASH_TABLE,
579 offsetof(Object, hash_table.items) + s,
580 &o, &p);
cec736d2
LP
581 if (r < 0)
582 return r;
583
29804cc1 584 memzero(o->hash_table.items, s);
cec736d2 585
de190aef
LP
586 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
587 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
588
589 return 0;
590}
591
de190aef 592static int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
593 uint64_t s, p;
594 void *t;
595 int r;
596
597 assert(f);
598
de190aef
LP
599 p = le64toh(f->header->data_hash_table_offset);
600 s = le64toh(f->header->data_hash_table_size);
cec736d2 601
de190aef 602 r = journal_file_move_to(f,
16e9f408 603 OBJECT_DATA_HASH_TABLE,
fcde2389 604 true,
de190aef
LP
605 p, s,
606 &t);
cec736d2
LP
607 if (r < 0)
608 return r;
609
de190aef 610 f->data_hash_table = t;
cec736d2
LP
611 return 0;
612}
613
de190aef 614static int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
615 uint64_t s, p;
616 void *t;
617 int r;
618
619 assert(f);
620
de190aef
LP
621 p = le64toh(f->header->field_hash_table_offset);
622 s = le64toh(f->header->field_hash_table_size);
cec736d2 623
de190aef 624 r = journal_file_move_to(f,
16e9f408 625 OBJECT_FIELD_HASH_TABLE,
fcde2389 626 true,
de190aef
LP
627 p, s,
628 &t);
cec736d2
LP
629 if (r < 0)
630 return r;
631
de190aef 632 f->field_hash_table = t;
cec736d2
LP
633 return 0;
634}
635
3c1668da
LP
636static int journal_file_link_field(
637 JournalFile *f,
638 Object *o,
639 uint64_t offset,
640 uint64_t hash) {
641
642 uint64_t p, h;
643 int r;
644
645 assert(f);
646 assert(o);
647 assert(offset > 0);
648
649 if (o->object.type != OBJECT_FIELD)
650 return -EINVAL;
651
652 /* This might alter the window we are looking at */
653
654 o->field.next_hash_offset = o->field.head_data_offset = 0;
655
656 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
657 p = le64toh(f->field_hash_table[h].tail_hash_offset);
658 if (p == 0)
659 f->field_hash_table[h].head_hash_offset = htole64(offset);
660 else {
661 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
662 if (r < 0)
663 return r;
664
665 o->field.next_hash_offset = htole64(offset);
666 }
667
668 f->field_hash_table[h].tail_hash_offset = htole64(offset);
669
670 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
671 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
672
673 return 0;
674}
675
676static int journal_file_link_data(
677 JournalFile *f,
678 Object *o,
679 uint64_t offset,
680 uint64_t hash) {
681
de190aef 682 uint64_t p, h;
cec736d2
LP
683 int r;
684
685 assert(f);
686 assert(o);
687 assert(offset > 0);
b588975f
LP
688
689 if (o->object.type != OBJECT_DATA)
690 return -EINVAL;
cec736d2 691
48496df6
LP
692 /* This might alter the window we are looking at */
693
de190aef
LP
694 o->data.next_hash_offset = o->data.next_field_offset = 0;
695 o->data.entry_offset = o->data.entry_array_offset = 0;
696 o->data.n_entries = 0;
cec736d2 697
de190aef 698 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
8db4213e 699 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 700 if (p == 0)
cec736d2 701 /* Only entry in the hash table is easy */
de190aef 702 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 703 else {
48496df6
LP
704 /* Move back to the previous data object, to patch in
705 * pointer */
cec736d2 706
de190aef 707 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
708 if (r < 0)
709 return r;
710
de190aef 711 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
712 }
713
de190aef 714 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 715
dca6219e
LP
716 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
717 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
718
cec736d2
LP
719 return 0;
720}
721
3c1668da
LP
722int journal_file_find_field_object_with_hash(
723 JournalFile *f,
724 const void *field, uint64_t size, uint64_t hash,
725 Object **ret, uint64_t *offset) {
726
727 uint64_t p, osize, h;
728 int r;
729
730 assert(f);
731 assert(field && size > 0);
732
733 osize = offsetof(Object, field.payload) + size;
734
735 if (f->header->field_hash_table_size == 0)
736 return -EBADMSG;
737
738 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
739 p = le64toh(f->field_hash_table[h].head_hash_offset);
740
741 while (p > 0) {
742 Object *o;
743
744 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
745 if (r < 0)
746 return r;
747
748 if (le64toh(o->field.hash) == hash &&
749 le64toh(o->object.size) == osize &&
750 memcmp(o->field.payload, field, size) == 0) {
751
752 if (ret)
753 *ret = o;
754 if (offset)
755 *offset = p;
756
757 return 1;
758 }
759
760 p = le64toh(o->field.next_hash_offset);
761 }
762
763 return 0;
764}
765
766int journal_file_find_field_object(
767 JournalFile *f,
768 const void *field, uint64_t size,
769 Object **ret, uint64_t *offset) {
770
771 uint64_t hash;
772
773 assert(f);
774 assert(field && size > 0);
775
776 hash = hash64(field, size);
777
778 return journal_file_find_field_object_with_hash(f,
779 field, size, hash,
780 ret, offset);
781}
782
de190aef
LP
783int journal_file_find_data_object_with_hash(
784 JournalFile *f,
785 const void *data, uint64_t size, uint64_t hash,
786 Object **ret, uint64_t *offset) {
48496df6 787
de190aef 788 uint64_t p, osize, h;
cec736d2
LP
789 int r;
790
791 assert(f);
792 assert(data || size == 0);
793
794 osize = offsetof(Object, data.payload) + size;
795
bc85bfee
LP
796 if (f->header->data_hash_table_size == 0)
797 return -EBADMSG;
798
de190aef
LP
799 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
800 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 801
de190aef
LP
802 while (p > 0) {
803 Object *o;
cec736d2 804
de190aef 805 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
806 if (r < 0)
807 return r;
808
807e17f0 809 if (le64toh(o->data.hash) != hash)
85a131e8 810 goto next;
807e17f0
LP
811
812 if (o->object.flags & OBJECT_COMPRESSED) {
813#ifdef HAVE_XZ
b785c858 814 uint64_t l, rsize;
cec736d2 815
807e17f0
LP
816 l = le64toh(o->object.size);
817 if (l <= offsetof(Object, data.payload))
cec736d2
LP
818 return -EBADMSG;
819
807e17f0
LP
820 l -= offsetof(Object, data.payload);
821
93b73b06 822 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0))
807e17f0
LP
823 return -EBADMSG;
824
b785c858 825 if (rsize == size &&
807e17f0
LP
826 memcmp(f->compress_buffer, data, size) == 0) {
827
828 if (ret)
829 *ret = o;
830
831 if (offset)
832 *offset = p;
833
834 return 1;
835 }
836#else
837 return -EPROTONOSUPPORT;
838#endif
839
840 } else if (le64toh(o->object.size) == osize &&
841 memcmp(o->data.payload, data, size) == 0) {
842
cec736d2
LP
843 if (ret)
844 *ret = o;
845
846 if (offset)
847 *offset = p;
848
de190aef 849 return 1;
cec736d2
LP
850 }
851
85a131e8 852 next:
cec736d2
LP
853 p = le64toh(o->data.next_hash_offset);
854 }
855
de190aef
LP
856 return 0;
857}
858
859int journal_file_find_data_object(
860 JournalFile *f,
861 const void *data, uint64_t size,
862 Object **ret, uint64_t *offset) {
863
864 uint64_t hash;
865
866 assert(f);
867 assert(data || size == 0);
868
869 hash = hash64(data, size);
870
871 return journal_file_find_data_object_with_hash(f,
872 data, size, hash,
873 ret, offset);
874}
875
3c1668da
LP
876static int journal_file_append_field(
877 JournalFile *f,
878 const void *field, uint64_t size,
879 Object **ret, uint64_t *offset) {
880
881 uint64_t hash, p;
882 uint64_t osize;
883 Object *o;
884 int r;
885
886 assert(f);
887 assert(field && size > 0);
888
889 hash = hash64(field, size);
890
891 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
892 if (r < 0)
893 return r;
894 else if (r > 0) {
895
896 if (ret)
897 *ret = o;
898
899 if (offset)
900 *offset = p;
901
902 return 0;
903 }
904
905 osize = offsetof(Object, field.payload) + size;
906 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
907 if (r < 0)
908 return r;
3c1668da
LP
909
910 o->field.hash = htole64(hash);
911 memcpy(o->field.payload, field, size);
912
913 r = journal_file_link_field(f, o, p, hash);
914 if (r < 0)
915 return r;
916
917 /* The linking might have altered the window, so let's
918 * refresh our pointer */
919 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
920 if (r < 0)
921 return r;
922
923#ifdef HAVE_GCRYPT
924 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
925 if (r < 0)
926 return r;
927#endif
928
929 if (ret)
930 *ret = o;
931
932 if (offset)
933 *offset = p;
934
935 return 0;
936}
937
48496df6
LP
938static int journal_file_append_data(
939 JournalFile *f,
940 const void *data, uint64_t size,
941 Object **ret, uint64_t *offset) {
942
de190aef
LP
943 uint64_t hash, p;
944 uint64_t osize;
945 Object *o;
946 int r;
807e17f0 947 bool compressed = false;
3c1668da 948 const void *eq;
de190aef
LP
949
950 assert(f);
951 assert(data || size == 0);
952
953 hash = hash64(data, size);
954
955 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
956 if (r < 0)
957 return r;
958 else if (r > 0) {
959
960 if (ret)
961 *ret = o;
962
963 if (offset)
964 *offset = p;
965
966 return 0;
967 }
968
969 osize = offsetof(Object, data.payload) + size;
970 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
971 if (r < 0)
972 return r;
973
cec736d2 974 o->data.hash = htole64(hash);
807e17f0
LP
975
976#ifdef HAVE_XZ
977 if (f->compress &&
978 size >= COMPRESSION_SIZE_THRESHOLD) {
979 uint64_t rsize;
980
981 compressed = compress_blob(data, size, o->data.payload, &rsize);
982
983 if (compressed) {
984 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
985 o->object.flags |= OBJECT_COMPRESSED;
986
507f22bd 987 log_debug("Compressed data object %"PRIu64" -> %"PRIu64, size, rsize);
807e17f0
LP
988 }
989 }
990#endif
991
64825d3c 992 if (!compressed && size > 0)
807e17f0 993 memcpy(o->data.payload, data, size);
cec736d2 994
de190aef 995 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
996 if (r < 0)
997 return r;
998
48496df6
LP
999 /* The linking might have altered the window, so let's
1000 * refresh our pointer */
1001 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1002 if (r < 0)
1003 return r;
1004
08c6f819
SL
1005 if (!data)
1006 eq = NULL;
1007 else
1008 eq = memchr(data, '=', size);
3c1668da 1009 if (eq && eq > data) {
748db592 1010 Object *fo = NULL;
3c1668da 1011 uint64_t fp;
3c1668da
LP
1012
1013 /* Create field object ... */
1014 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1015 if (r < 0)
1016 return r;
1017
1018 /* ... and link it in. */
1019 o->data.next_field_offset = fo->field.head_data_offset;
1020 fo->field.head_data_offset = le64toh(p);
1021 }
1022
5996c7c2
LP
1023#ifdef HAVE_GCRYPT
1024 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1025 if (r < 0)
1026 return r;
1027#endif
1028
cec736d2
LP
1029 if (ret)
1030 *ret = o;
1031
1032 if (offset)
de190aef 1033 *offset = p;
cec736d2
LP
1034
1035 return 0;
1036}
1037
1038uint64_t journal_file_entry_n_items(Object *o) {
1039 assert(o);
b588975f
LP
1040
1041 if (o->object.type != OBJECT_ENTRY)
1042 return 0;
cec736d2
LP
1043
1044 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1045}
1046
0284adc6 1047uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1048 assert(o);
b588975f
LP
1049
1050 if (o->object.type != OBJECT_ENTRY_ARRAY)
1051 return 0;
de190aef
LP
1052
1053 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1054}
1055
fb9a24b6
LP
1056uint64_t journal_file_hash_table_n_items(Object *o) {
1057 assert(o);
b588975f
LP
1058
1059 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1060 o->object.type != OBJECT_FIELD_HASH_TABLE)
1061 return 0;
fb9a24b6
LP
1062
1063 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1064}
1065
de190aef 1066static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1067 le64_t *first,
1068 le64_t *idx,
de190aef 1069 uint64_t p) {
cec736d2 1070 int r;
de190aef
LP
1071 uint64_t n = 0, ap = 0, q, i, a, hidx;
1072 Object *o;
1073
cec736d2 1074 assert(f);
de190aef
LP
1075 assert(first);
1076 assert(idx);
1077 assert(p > 0);
cec736d2 1078
de190aef
LP
1079 a = le64toh(*first);
1080 i = hidx = le64toh(*idx);
1081 while (a > 0) {
1082
1083 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1084 if (r < 0)
1085 return r;
cec736d2 1086
de190aef
LP
1087 n = journal_file_entry_array_n_items(o);
1088 if (i < n) {
1089 o->entry_array.items[i] = htole64(p);
1090 *idx = htole64(hidx + 1);
1091 return 0;
1092 }
cec736d2 1093
de190aef
LP
1094 i -= n;
1095 ap = a;
1096 a = le64toh(o->entry_array.next_entry_array_offset);
1097 }
1098
1099 if (hidx > n)
1100 n = (hidx+1) * 2;
1101 else
1102 n = n * 2;
1103
1104 if (n < 4)
1105 n = 4;
1106
1107 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1108 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1109 &o, &q);
cec736d2
LP
1110 if (r < 0)
1111 return r;
1112
feb12d3e 1113#ifdef HAVE_GCRYPT
5996c7c2 1114 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1115 if (r < 0)
1116 return r;
feb12d3e 1117#endif
b0af6f41 1118
de190aef 1119 o->entry_array.items[i] = htole64(p);
cec736d2 1120
de190aef 1121 if (ap == 0)
7be3aa17 1122 *first = htole64(q);
cec736d2 1123 else {
de190aef 1124 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1125 if (r < 0)
1126 return r;
1127
de190aef
LP
1128 o->entry_array.next_entry_array_offset = htole64(q);
1129 }
cec736d2 1130
2dee23eb
LP
1131 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1132 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1133
de190aef
LP
1134 *idx = htole64(hidx + 1);
1135
1136 return 0;
1137}
cec736d2 1138
de190aef 1139static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1140 le64_t *extra,
1141 le64_t *first,
1142 le64_t *idx,
de190aef
LP
1143 uint64_t p) {
1144
1145 int r;
1146
1147 assert(f);
1148 assert(extra);
1149 assert(first);
1150 assert(idx);
1151 assert(p > 0);
1152
1153 if (*idx == 0)
1154 *extra = htole64(p);
1155 else {
4fd052ae 1156 le64_t i;
de190aef 1157
7be3aa17 1158 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1159 r = link_entry_into_array(f, first, &i, p);
1160 if (r < 0)
1161 return r;
cec736d2
LP
1162 }
1163
de190aef
LP
1164 *idx = htole64(le64toh(*idx) + 1);
1165 return 0;
1166}
1167
1168static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1169 uint64_t p;
1170 int r;
1171 assert(f);
1172 assert(o);
1173 assert(offset > 0);
1174
1175 p = le64toh(o->entry.items[i].object_offset);
1176 if (p == 0)
1177 return -EINVAL;
1178
1179 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1180 if (r < 0)
1181 return r;
1182
de190aef
LP
1183 return link_entry_into_array_plus_one(f,
1184 &o->data.entry_offset,
1185 &o->data.entry_array_offset,
1186 &o->data.n_entries,
1187 offset);
cec736d2
LP
1188}
1189
1190static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1191 uint64_t n, i;
cec736d2
LP
1192 int r;
1193
1194 assert(f);
1195 assert(o);
1196 assert(offset > 0);
b588975f
LP
1197
1198 if (o->object.type != OBJECT_ENTRY)
1199 return -EINVAL;
cec736d2 1200
b788cc23
LP
1201 __sync_synchronize();
1202
cec736d2 1203 /* Link up the entry itself */
de190aef
LP
1204 r = link_entry_into_array(f,
1205 &f->header->entry_array_offset,
1206 &f->header->n_entries,
1207 offset);
1208 if (r < 0)
1209 return r;
cec736d2 1210
507f22bd 1211 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1212
de190aef 1213 if (f->header->head_entry_realtime == 0)
0ac38b70 1214 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1215
0ac38b70 1216 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1217 f->header->tail_entry_monotonic = o->entry.monotonic;
1218
1219 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1220
1221 /* Link up the items */
1222 n = journal_file_entry_n_items(o);
1223 for (i = 0; i < n; i++) {
1224 r = journal_file_link_entry_item(f, o, offset, i);
1225 if (r < 0)
1226 return r;
1227 }
1228
cec736d2
LP
1229 return 0;
1230}
1231
1232static int journal_file_append_entry_internal(
1233 JournalFile *f,
1234 const dual_timestamp *ts,
1235 uint64_t xor_hash,
1236 const EntryItem items[], unsigned n_items,
de190aef 1237 uint64_t *seqnum,
cec736d2
LP
1238 Object **ret, uint64_t *offset) {
1239 uint64_t np;
1240 uint64_t osize;
1241 Object *o;
1242 int r;
1243
1244 assert(f);
1245 assert(items || n_items == 0);
de190aef 1246 assert(ts);
cec736d2
LP
1247
1248 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1249
de190aef 1250 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1251 if (r < 0)
1252 return r;
1253
d98cc1f2 1254 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1255 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1256 o->entry.realtime = htole64(ts->realtime);
1257 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1258 o->entry.xor_hash = htole64(xor_hash);
1259 o->entry.boot_id = f->header->boot_id;
1260
feb12d3e 1261#ifdef HAVE_GCRYPT
5996c7c2 1262 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1263 if (r < 0)
1264 return r;
feb12d3e 1265#endif
b0af6f41 1266
cec736d2
LP
1267 r = journal_file_link_entry(f, o, np);
1268 if (r < 0)
1269 return r;
1270
1271 if (ret)
1272 *ret = o;
1273
1274 if (offset)
1275 *offset = np;
1276
1277 return 0;
1278}
1279
cf244689 1280void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1281 assert(f);
1282
1283 /* inotify() does not receive IN_MODIFY events from file
1284 * accesses done via mmap(). After each access we hence
1285 * trigger IN_MODIFY by truncating the journal file to its
1286 * current size which triggers IN_MODIFY. */
1287
bc85bfee
LP
1288 __sync_synchronize();
1289
50f20cfd 1290 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
c5315881 1291 log_error("Failed to truncate file to its own size: %m");
50f20cfd
LP
1292}
1293
1f2da9ec
LP
1294static int entry_item_cmp(const void *_a, const void *_b) {
1295 const EntryItem *a = _a, *b = _b;
1296
1297 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1298 return -1;
1299 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1300 return 1;
1301 return 0;
1302}
1303
de190aef 1304int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1305 unsigned i;
1306 EntryItem *items;
1307 int r;
1308 uint64_t xor_hash = 0;
de190aef 1309 struct dual_timestamp _ts;
cec736d2
LP
1310
1311 assert(f);
1312 assert(iovec || n_iovec == 0);
1313
de190aef
LP
1314 if (!ts) {
1315 dual_timestamp_get(&_ts);
1316 ts = &_ts;
1317 }
1318
1319 if (f->tail_entry_monotonic_valid &&
1320 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1321 return -EINVAL;
1322
feb12d3e 1323#ifdef HAVE_GCRYPT
7560fffc
LP
1324 r = journal_file_maybe_append_tag(f, ts->realtime);
1325 if (r < 0)
1326 return r;
feb12d3e 1327#endif
7560fffc 1328
64825d3c 1329 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1330 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1331
1332 for (i = 0; i < n_iovec; i++) {
1333 uint64_t p;
1334 Object *o;
1335
1336 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1337 if (r < 0)
cf244689 1338 return r;
cec736d2
LP
1339
1340 xor_hash ^= le64toh(o->data.hash);
1341 items[i].object_offset = htole64(p);
de7b95cd 1342 items[i].hash = o->data.hash;
cec736d2
LP
1343 }
1344
1f2da9ec
LP
1345 /* Order by the position on disk, in order to improve seek
1346 * times for rotating media. */
7ff7394d 1347 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1348
de190aef 1349 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1350
50f20cfd
LP
1351 journal_file_post_change(f);
1352
cec736d2
LP
1353 return r;
1354}
1355
a4bcff5b 1356typedef struct ChainCacheItem {
fb099c8d 1357 uint64_t first; /* the array at the beginning of the chain */
a4bcff5b
LP
1358 uint64_t array; /* the cached array */
1359 uint64_t begin; /* the first item in the cached array */
1360 uint64_t total; /* the total number of items in all arrays before this one in the chain */
f268980d 1361 uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
a4bcff5b
LP
1362} ChainCacheItem;
1363
1364static void chain_cache_put(
1365 Hashmap *h,
1366 ChainCacheItem *ci,
1367 uint64_t first,
1368 uint64_t array,
1369 uint64_t begin,
f268980d
LP
1370 uint64_t total,
1371 uint64_t last_index) {
a4bcff5b
LP
1372
1373 if (!ci) {
34741aa3
LP
1374 /* If the chain item to cache for this chain is the
1375 * first one it's not worth caching anything */
1376 if (array == first)
1377 return;
1378
a4bcff5b
LP
1379 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1380 ci = hashmap_steal_first(h);
1381 else {
1382 ci = new(ChainCacheItem, 1);
1383 if (!ci)
1384 return;
1385 }
1386
1387 ci->first = first;
1388
1389 if (hashmap_put(h, &ci->first, ci) < 0) {
1390 free(ci);
1391 return;
1392 }
1393 } else
1394 assert(ci->first == first);
1395
1396 ci->array = array;
1397 ci->begin = begin;
1398 ci->total = total;
f268980d 1399 ci->last_index = last_index;
a4bcff5b
LP
1400}
1401
f268980d
LP
1402static int generic_array_get(
1403 JournalFile *f,
1404 uint64_t first,
1405 uint64_t i,
1406 Object **ret, uint64_t *offset) {
de190aef 1407
cec736d2 1408 Object *o;
a4bcff5b 1409 uint64_t p = 0, a, t = 0;
cec736d2 1410 int r;
a4bcff5b 1411 ChainCacheItem *ci;
cec736d2
LP
1412
1413 assert(f);
1414
de190aef 1415 a = first;
a4bcff5b
LP
1416
1417 /* Try the chain cache first */
1418 ci = hashmap_get(f->chain_cache, &first);
1419 if (ci && i > ci->total) {
1420 a = ci->array;
1421 i -= ci->total;
1422 t = ci->total;
1423 }
1424
de190aef 1425 while (a > 0) {
a4bcff5b 1426 uint64_t k;
cec736d2 1427
de190aef
LP
1428 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1429 if (r < 0)
1430 return r;
cec736d2 1431
a4bcff5b
LP
1432 k = journal_file_entry_array_n_items(o);
1433 if (i < k) {
de190aef 1434 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1435 goto found;
cec736d2
LP
1436 }
1437
a4bcff5b
LP
1438 i -= k;
1439 t += k;
de190aef
LP
1440 a = le64toh(o->entry_array.next_entry_array_offset);
1441 }
1442
a4bcff5b
LP
1443 return 0;
1444
1445found:
1446 /* Let's cache this item for the next invocation */
af13a6b0 1447 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1448
1449 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1450 if (r < 0)
1451 return r;
1452
1453 if (ret)
1454 *ret = o;
1455
1456 if (offset)
1457 *offset = p;
1458
1459 return 1;
1460}
1461
f268980d
LP
1462static int generic_array_get_plus_one(
1463 JournalFile *f,
1464 uint64_t extra,
1465 uint64_t first,
1466 uint64_t i,
1467 Object **ret, uint64_t *offset) {
de190aef
LP
1468
1469 Object *o;
1470
1471 assert(f);
1472
1473 if (i == 0) {
1474 int r;
1475
1476 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1477 if (r < 0)
1478 return r;
1479
de190aef
LP
1480 if (ret)
1481 *ret = o;
cec736d2 1482
de190aef
LP
1483 if (offset)
1484 *offset = extra;
cec736d2 1485
de190aef 1486 return 1;
cec736d2
LP
1487 }
1488
de190aef
LP
1489 return generic_array_get(f, first, i-1, ret, offset);
1490}
cec736d2 1491
de190aef
LP
1492enum {
1493 TEST_FOUND,
1494 TEST_LEFT,
1495 TEST_RIGHT
1496};
cec736d2 1497
f268980d
LP
1498static int generic_array_bisect(
1499 JournalFile *f,
1500 uint64_t first,
1501 uint64_t n,
1502 uint64_t needle,
1503 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1504 direction_t direction,
1505 Object **ret,
1506 uint64_t *offset,
1507 uint64_t *idx) {
1508
1509 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1510 bool subtract_one = false;
1511 Object *o, *array = NULL;
1512 int r;
a4bcff5b 1513 ChainCacheItem *ci;
cec736d2 1514
de190aef
LP
1515 assert(f);
1516 assert(test_object);
cec736d2 1517
a4bcff5b 1518 /* Start with the first array in the chain */
de190aef 1519 a = first;
a4bcff5b
LP
1520
1521 ci = hashmap_get(f->chain_cache, &first);
1522 if (ci && n > ci->total) {
1523 /* Ah, we have iterated this bisection array chain
1524 * previously! Let's see if we can skip ahead in the
1525 * chain, as far as the last time. But we can't jump
1526 * backwards in the chain, so let's check that
1527 * first. */
1528
1529 r = test_object(f, ci->begin, needle);
1530 if (r < 0)
1531 return r;
1532
1533 if (r == TEST_LEFT) {
f268980d 1534 /* OK, what we are looking for is right of the
a4bcff5b
LP
1535 * begin of this EntryArray, so let's jump
1536 * straight to previously cached array in the
1537 * chain */
1538
1539 a = ci->array;
1540 n -= ci->total;
1541 t = ci->total;
f268980d 1542 last_index = ci->last_index;
a4bcff5b
LP
1543 }
1544 }
1545
de190aef
LP
1546 while (a > 0) {
1547 uint64_t left, right, k, lp;
1548
1549 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1550 if (r < 0)
1551 return r;
1552
de190aef
LP
1553 k = journal_file_entry_array_n_items(array);
1554 right = MIN(k, n);
1555 if (right <= 0)
1556 return 0;
cec736d2 1557
de190aef
LP
1558 i = right - 1;
1559 lp = p = le64toh(array->entry_array.items[i]);
1560 if (p <= 0)
1561 return -EBADMSG;
cec736d2 1562
de190aef
LP
1563 r = test_object(f, p, needle);
1564 if (r < 0)
1565 return r;
cec736d2 1566
de190aef
LP
1567 if (r == TEST_FOUND)
1568 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1569
1570 if (r == TEST_RIGHT) {
1571 left = 0;
1572 right -= 1;
f268980d
LP
1573
1574 if (last_index != (uint64_t) -1) {
1575 assert(last_index <= right);
1576
1577 /* If we cached the last index we
1578 * looked at, let's try to not to jump
1579 * too wildly around and see if we can
1580 * limit the range to look at early to
1581 * the immediate neighbors of the last
1582 * index we looked at. */
1583
1584 if (last_index > 0) {
1585 uint64_t x = last_index - 1;
1586
1587 p = le64toh(array->entry_array.items[x]);
1588 if (p <= 0)
1589 return -EBADMSG;
1590
1591 r = test_object(f, p, needle);
1592 if (r < 0)
1593 return r;
1594
1595 if (r == TEST_FOUND)
1596 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1597
1598 if (r == TEST_RIGHT)
1599 right = x;
1600 else
1601 left = x + 1;
1602 }
1603
1604 if (last_index < right) {
1605 uint64_t y = last_index + 1;
1606
1607 p = le64toh(array->entry_array.items[y]);
1608 if (p <= 0)
1609 return -EBADMSG;
1610
1611 r = test_object(f, p, needle);
1612 if (r < 0)
1613 return r;
1614
1615 if (r == TEST_FOUND)
1616 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1617
1618 if (r == TEST_RIGHT)
1619 right = y;
1620 else
1621 left = y + 1;
1622 }
f268980d
LP
1623 }
1624
de190aef
LP
1625 for (;;) {
1626 if (left == right) {
1627 if (direction == DIRECTION_UP)
1628 subtract_one = true;
1629
1630 i = left;
1631 goto found;
1632 }
1633
1634 assert(left < right);
de190aef 1635 i = (left + right) / 2;
f268980d 1636
de190aef
LP
1637 p = le64toh(array->entry_array.items[i]);
1638 if (p <= 0)
1639 return -EBADMSG;
1640
1641 r = test_object(f, p, needle);
1642 if (r < 0)
1643 return r;
cec736d2 1644
de190aef
LP
1645 if (r == TEST_FOUND)
1646 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1647
1648 if (r == TEST_RIGHT)
1649 right = i;
1650 else
1651 left = i + 1;
1652 }
1653 }
1654
cbdca852
LP
1655 if (k > n) {
1656 if (direction == DIRECTION_UP) {
1657 i = n;
1658 subtract_one = true;
1659 goto found;
1660 }
1661
cec736d2 1662 return 0;
cbdca852 1663 }
cec736d2 1664
de190aef
LP
1665 last_p = lp;
1666
1667 n -= k;
1668 t += k;
f268980d 1669 last_index = (uint64_t) -1;
de190aef 1670 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1671 }
1672
1673 return 0;
de190aef
LP
1674
1675found:
1676 if (subtract_one && t == 0 && i == 0)
1677 return 0;
1678
a4bcff5b 1679 /* Let's cache this item for the next invocation */
af13a6b0 1680 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 1681
de190aef
LP
1682 if (subtract_one && i == 0)
1683 p = last_p;
1684 else if (subtract_one)
1685 p = le64toh(array->entry_array.items[i-1]);
1686 else
1687 p = le64toh(array->entry_array.items[i]);
1688
1689 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1690 if (r < 0)
1691 return r;
1692
1693 if (ret)
1694 *ret = o;
1695
1696 if (offset)
1697 *offset = p;
1698
1699 if (idx)
cbdca852 1700 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1701
1702 return 1;
cec736d2
LP
1703}
1704
f268980d
LP
1705
1706static int generic_array_bisect_plus_one(
1707 JournalFile *f,
1708 uint64_t extra,
1709 uint64_t first,
1710 uint64_t n,
1711 uint64_t needle,
1712 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1713 direction_t direction,
1714 Object **ret,
1715 uint64_t *offset,
1716 uint64_t *idx) {
de190aef 1717
cec736d2 1718 int r;
cbdca852
LP
1719 bool step_back = false;
1720 Object *o;
cec736d2
LP
1721
1722 assert(f);
de190aef 1723 assert(test_object);
cec736d2 1724
de190aef
LP
1725 if (n <= 0)
1726 return 0;
cec736d2 1727
de190aef
LP
1728 /* This bisects the array in object 'first', but first checks
1729 * an extra */
de190aef
LP
1730 r = test_object(f, extra, needle);
1731 if (r < 0)
1732 return r;
a536e261
LP
1733
1734 if (r == TEST_FOUND)
1735 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1736
cbdca852
LP
1737 /* if we are looking with DIRECTION_UP then we need to first
1738 see if in the actual array there is a matching entry, and
1739 return the last one of that. But if there isn't any we need
1740 to return this one. Hence remember this, and return it
1741 below. */
1742 if (r == TEST_LEFT)
1743 step_back = direction == DIRECTION_UP;
de190aef 1744
cbdca852
LP
1745 if (r == TEST_RIGHT) {
1746 if (direction == DIRECTION_DOWN)
1747 goto found;
1748 else
1749 return 0;
a536e261 1750 }
cec736d2 1751
de190aef
LP
1752 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1753
cbdca852
LP
1754 if (r == 0 && step_back)
1755 goto found;
1756
ecf68b1d 1757 if (r > 0 && idx)
de190aef
LP
1758 (*idx) ++;
1759
1760 return r;
cbdca852
LP
1761
1762found:
1763 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1764 if (r < 0)
1765 return r;
1766
1767 if (ret)
1768 *ret = o;
1769
1770 if (offset)
1771 *offset = extra;
1772
1773 if (idx)
1774 *idx = 0;
1775
1776 return 1;
1777}
1778
44a6b1b6 1779_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1780 assert(f);
1781 assert(p > 0);
1782
1783 if (p == needle)
1784 return TEST_FOUND;
1785 else if (p < needle)
1786 return TEST_LEFT;
1787 else
1788 return TEST_RIGHT;
1789}
1790
1791int journal_file_move_to_entry_by_offset(
1792 JournalFile *f,
1793 uint64_t p,
1794 direction_t direction,
1795 Object **ret,
1796 uint64_t *offset) {
1797
1798 return generic_array_bisect(f,
1799 le64toh(f->header->entry_array_offset),
1800 le64toh(f->header->n_entries),
1801 p,
1802 test_object_offset,
1803 direction,
1804 ret, offset, NULL);
de190aef
LP
1805}
1806
cbdca852 1807
de190aef
LP
1808static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1809 Object *o;
1810 int r;
1811
1812 assert(f);
1813 assert(p > 0);
1814
1815 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1816 if (r < 0)
1817 return r;
1818
de190aef
LP
1819 if (le64toh(o->entry.seqnum) == needle)
1820 return TEST_FOUND;
1821 else if (le64toh(o->entry.seqnum) < needle)
1822 return TEST_LEFT;
1823 else
1824 return TEST_RIGHT;
1825}
cec736d2 1826
de190aef
LP
1827int journal_file_move_to_entry_by_seqnum(
1828 JournalFile *f,
1829 uint64_t seqnum,
1830 direction_t direction,
1831 Object **ret,
1832 uint64_t *offset) {
1833
1834 return generic_array_bisect(f,
1835 le64toh(f->header->entry_array_offset),
1836 le64toh(f->header->n_entries),
1837 seqnum,
1838 test_object_seqnum,
1839 direction,
1840 ret, offset, NULL);
1841}
cec736d2 1842
de190aef
LP
1843static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1844 Object *o;
1845 int r;
1846
1847 assert(f);
1848 assert(p > 0);
1849
1850 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1851 if (r < 0)
1852 return r;
1853
1854 if (le64toh(o->entry.realtime) == needle)
1855 return TEST_FOUND;
1856 else if (le64toh(o->entry.realtime) < needle)
1857 return TEST_LEFT;
1858 else
1859 return TEST_RIGHT;
cec736d2
LP
1860}
1861
de190aef
LP
1862int journal_file_move_to_entry_by_realtime(
1863 JournalFile *f,
1864 uint64_t realtime,
1865 direction_t direction,
1866 Object **ret,
1867 uint64_t *offset) {
1868
1869 return generic_array_bisect(f,
1870 le64toh(f->header->entry_array_offset),
1871 le64toh(f->header->n_entries),
1872 realtime,
1873 test_object_realtime,
1874 direction,
1875 ret, offset, NULL);
1876}
1877
1878static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1879 Object *o;
1880 int r;
1881
1882 assert(f);
1883 assert(p > 0);
1884
1885 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1886 if (r < 0)
1887 return r;
1888
1889 if (le64toh(o->entry.monotonic) == needle)
1890 return TEST_FOUND;
1891 else if (le64toh(o->entry.monotonic) < needle)
1892 return TEST_LEFT;
1893 else
1894 return TEST_RIGHT;
1895}
1896
47838ab3
ZJS
1897static inline int find_data_object_by_boot_id(
1898 JournalFile *f,
1899 sd_id128_t boot_id,
1900 Object **o,
1901 uint64_t *b) {
1902 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1903
1904 sd_id128_to_string(boot_id, t + 9);
1905 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1906}
1907
de190aef
LP
1908int journal_file_move_to_entry_by_monotonic(
1909 JournalFile *f,
1910 sd_id128_t boot_id,
1911 uint64_t monotonic,
1912 direction_t direction,
1913 Object **ret,
1914 uint64_t *offset) {
1915
de190aef
LP
1916 Object *o;
1917 int r;
1918
cbdca852 1919 assert(f);
de190aef 1920
47838ab3 1921 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
1922 if (r < 0)
1923 return r;
cbdca852 1924 if (r == 0)
de190aef
LP
1925 return -ENOENT;
1926
1927 return generic_array_bisect_plus_one(f,
1928 le64toh(o->data.entry_offset),
1929 le64toh(o->data.entry_array_offset),
1930 le64toh(o->data.n_entries),
1931 monotonic,
1932 test_object_monotonic,
1933 direction,
1934 ret, offset, NULL);
1935}
1936
de190aef
LP
1937int journal_file_next_entry(
1938 JournalFile *f,
1939 Object *o, uint64_t p,
1940 direction_t direction,
1941 Object **ret, uint64_t *offset) {
1942
fb099c8d 1943 uint64_t i, n, ofs;
cec736d2
LP
1944 int r;
1945
1946 assert(f);
de190aef
LP
1947 assert(p > 0 || !o);
1948
1949 n = le64toh(f->header->n_entries);
1950 if (n <= 0)
1951 return 0;
cec736d2
LP
1952
1953 if (!o)
de190aef 1954 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 1955 else {
de190aef 1956 if (o->object.type != OBJECT_ENTRY)
cec736d2
LP
1957 return -EINVAL;
1958
de190aef
LP
1959 r = generic_array_bisect(f,
1960 le64toh(f->header->entry_array_offset),
1961 le64toh(f->header->n_entries),
1962 p,
1963 test_object_offset,
1964 DIRECTION_DOWN,
1965 NULL, NULL,
1966 &i);
1967 if (r <= 0)
1968 return r;
1969
1970 if (direction == DIRECTION_DOWN) {
1971 if (i >= n - 1)
1972 return 0;
1973
1974 i++;
1975 } else {
1976 if (i <= 0)
1977 return 0;
1978
1979 i--;
1980 }
cec736d2
LP
1981 }
1982
de190aef 1983 /* And jump to it */
fb099c8d
ZJS
1984 r = generic_array_get(f,
1985 le64toh(f->header->entry_array_offset),
1986 i,
1987 ret, &ofs);
1988 if (r <= 0)
1989 return r;
1990
1991 if (p > 0 &&
1992 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
1993 log_debug("%s: entry array corrupted at entry %"PRIu64,
1994 f->path, i);
1995 return -EBADMSG;
1996 }
1997
1998 if (offset)
1999 *offset = ofs;
2000
2001 return 1;
de190aef 2002}
cec736d2 2003
de190aef
LP
2004int journal_file_skip_entry(
2005 JournalFile *f,
2006 Object *o, uint64_t p,
2007 int64_t skip,
2008 Object **ret, uint64_t *offset) {
2009
2010 uint64_t i, n;
2011 int r;
2012
2013 assert(f);
2014 assert(o);
2015 assert(p > 0);
2016
2017 if (o->object.type != OBJECT_ENTRY)
2018 return -EINVAL;
2019
2020 r = generic_array_bisect(f,
2021 le64toh(f->header->entry_array_offset),
2022 le64toh(f->header->n_entries),
2023 p,
2024 test_object_offset,
2025 DIRECTION_DOWN,
2026 NULL, NULL,
2027 &i);
2028 if (r <= 0)
cec736d2
LP
2029 return r;
2030
de190aef
LP
2031 /* Calculate new index */
2032 if (skip < 0) {
2033 if ((uint64_t) -skip >= i)
2034 i = 0;
2035 else
2036 i = i - (uint64_t) -skip;
2037 } else
2038 i += (uint64_t) skip;
cec736d2 2039
de190aef
LP
2040 n = le64toh(f->header->n_entries);
2041 if (n <= 0)
2042 return -EBADMSG;
cec736d2 2043
de190aef
LP
2044 if (i >= n)
2045 i = n-1;
2046
2047 return generic_array_get(f,
2048 le64toh(f->header->entry_array_offset),
2049 i,
2050 ret, offset);
cec736d2
LP
2051}
2052
de190aef
LP
2053int journal_file_next_entry_for_data(
2054 JournalFile *f,
2055 Object *o, uint64_t p,
2056 uint64_t data_offset,
2057 direction_t direction,
2058 Object **ret, uint64_t *offset) {
2059
2060 uint64_t n, i;
cec736d2 2061 int r;
de190aef 2062 Object *d;
cec736d2
LP
2063
2064 assert(f);
de190aef 2065 assert(p > 0 || !o);
cec736d2 2066
de190aef 2067 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2068 if (r < 0)
de190aef 2069 return r;
cec736d2 2070
de190aef
LP
2071 n = le64toh(d->data.n_entries);
2072 if (n <= 0)
2073 return n;
cec736d2 2074
de190aef
LP
2075 if (!o)
2076 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2077 else {
2078 if (o->object.type != OBJECT_ENTRY)
2079 return -EINVAL;
cec736d2 2080
de190aef
LP
2081 r = generic_array_bisect_plus_one(f,
2082 le64toh(d->data.entry_offset),
2083 le64toh(d->data.entry_array_offset),
2084 le64toh(d->data.n_entries),
2085 p,
2086 test_object_offset,
2087 DIRECTION_DOWN,
2088 NULL, NULL,
2089 &i);
2090
2091 if (r <= 0)
cec736d2
LP
2092 return r;
2093
de190aef
LP
2094 if (direction == DIRECTION_DOWN) {
2095 if (i >= n - 1)
2096 return 0;
cec736d2 2097
de190aef
LP
2098 i++;
2099 } else {
2100 if (i <= 0)
2101 return 0;
cec736d2 2102
de190aef
LP
2103 i--;
2104 }
cec736d2 2105
de190aef 2106 }
cec736d2 2107
de190aef
LP
2108 return generic_array_get_plus_one(f,
2109 le64toh(d->data.entry_offset),
2110 le64toh(d->data.entry_array_offset),
2111 i,
2112 ret, offset);
2113}
cec736d2 2114
cbdca852
LP
2115int journal_file_move_to_entry_by_offset_for_data(
2116 JournalFile *f,
2117 uint64_t data_offset,
2118 uint64_t p,
2119 direction_t direction,
2120 Object **ret, uint64_t *offset) {
2121
2122 int r;
2123 Object *d;
2124
2125 assert(f);
2126
2127 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2128 if (r < 0)
2129 return r;
2130
2131 return generic_array_bisect_plus_one(f,
2132 le64toh(d->data.entry_offset),
2133 le64toh(d->data.entry_array_offset),
2134 le64toh(d->data.n_entries),
2135 p,
2136 test_object_offset,
2137 direction,
2138 ret, offset, NULL);
2139}
2140
2141int journal_file_move_to_entry_by_monotonic_for_data(
2142 JournalFile *f,
2143 uint64_t data_offset,
2144 sd_id128_t boot_id,
2145 uint64_t monotonic,
2146 direction_t direction,
2147 Object **ret, uint64_t *offset) {
2148
cbdca852
LP
2149 Object *o, *d;
2150 int r;
2151 uint64_t b, z;
2152
2153 assert(f);
2154
2155 /* First, seek by time */
47838ab3 2156 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2157 if (r < 0)
2158 return r;
2159 if (r == 0)
2160 return -ENOENT;
2161
2162 r = generic_array_bisect_plus_one(f,
2163 le64toh(o->data.entry_offset),
2164 le64toh(o->data.entry_array_offset),
2165 le64toh(o->data.n_entries),
2166 monotonic,
2167 test_object_monotonic,
2168 direction,
2169 NULL, &z, NULL);
2170 if (r <= 0)
2171 return r;
2172
2173 /* And now, continue seeking until we find an entry that
2174 * exists in both bisection arrays */
2175
2176 for (;;) {
2177 Object *qo;
2178 uint64_t p, q;
2179
2180 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2181 if (r < 0)
2182 return r;
2183
2184 r = generic_array_bisect_plus_one(f,
2185 le64toh(d->data.entry_offset),
2186 le64toh(d->data.entry_array_offset),
2187 le64toh(d->data.n_entries),
2188 z,
2189 test_object_offset,
2190 direction,
2191 NULL, &p, NULL);
2192 if (r <= 0)
2193 return r;
2194
2195 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2196 if (r < 0)
2197 return r;
2198
2199 r = generic_array_bisect_plus_one(f,
2200 le64toh(o->data.entry_offset),
2201 le64toh(o->data.entry_array_offset),
2202 le64toh(o->data.n_entries),
2203 p,
2204 test_object_offset,
2205 direction,
2206 &qo, &q, NULL);
2207
2208 if (r <= 0)
2209 return r;
2210
2211 if (p == q) {
2212 if (ret)
2213 *ret = qo;
2214 if (offset)
2215 *offset = q;
2216
2217 return 1;
2218 }
2219
2220 z = q;
2221 }
cbdca852
LP
2222}
2223
de190aef
LP
2224int journal_file_move_to_entry_by_seqnum_for_data(
2225 JournalFile *f,
2226 uint64_t data_offset,
2227 uint64_t seqnum,
2228 direction_t direction,
2229 Object **ret, uint64_t *offset) {
cec736d2 2230
de190aef
LP
2231 Object *d;
2232 int r;
cec736d2 2233
91a31dde
LP
2234 assert(f);
2235
de190aef 2236 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2237 if (r < 0)
de190aef 2238 return r;
cec736d2 2239
de190aef
LP
2240 return generic_array_bisect_plus_one(f,
2241 le64toh(d->data.entry_offset),
2242 le64toh(d->data.entry_array_offset),
2243 le64toh(d->data.n_entries),
2244 seqnum,
2245 test_object_seqnum,
2246 direction,
2247 ret, offset, NULL);
2248}
cec736d2 2249
de190aef
LP
2250int journal_file_move_to_entry_by_realtime_for_data(
2251 JournalFile *f,
2252 uint64_t data_offset,
2253 uint64_t realtime,
2254 direction_t direction,
2255 Object **ret, uint64_t *offset) {
2256
2257 Object *d;
2258 int r;
2259
91a31dde
LP
2260 assert(f);
2261
de190aef 2262 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2263 if (r < 0)
de190aef
LP
2264 return r;
2265
2266 return generic_array_bisect_plus_one(f,
2267 le64toh(d->data.entry_offset),
2268 le64toh(d->data.entry_array_offset),
2269 le64toh(d->data.n_entries),
2270 realtime,
2271 test_object_realtime,
2272 direction,
2273 ret, offset, NULL);
cec736d2
LP
2274}
2275
0284adc6 2276void journal_file_dump(JournalFile *f) {
7560fffc 2277 Object *o;
7560fffc 2278 int r;
0284adc6 2279 uint64_t p;
7560fffc
LP
2280
2281 assert(f);
2282
0284adc6 2283 journal_file_print_header(f);
7560fffc 2284
0284adc6
LP
2285 p = le64toh(f->header->header_size);
2286 while (p != 0) {
2287 r = journal_file_move_to_object(f, -1, p, &o);
2288 if (r < 0)
2289 goto fail;
7560fffc 2290
0284adc6 2291 switch (o->object.type) {
d98cc1f2 2292
0284adc6
LP
2293 case OBJECT_UNUSED:
2294 printf("Type: OBJECT_UNUSED\n");
2295 break;
d98cc1f2 2296
0284adc6
LP
2297 case OBJECT_DATA:
2298 printf("Type: OBJECT_DATA\n");
2299 break;
7560fffc 2300
3c1668da
LP
2301 case OBJECT_FIELD:
2302 printf("Type: OBJECT_FIELD\n");
2303 break;
2304
0284adc6 2305 case OBJECT_ENTRY:
507f22bd
ZJS
2306 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2307 le64toh(o->entry.seqnum),
2308 le64toh(o->entry.monotonic),
2309 le64toh(o->entry.realtime));
0284adc6 2310 break;
7560fffc 2311
0284adc6
LP
2312 case OBJECT_FIELD_HASH_TABLE:
2313 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2314 break;
7560fffc 2315
0284adc6
LP
2316 case OBJECT_DATA_HASH_TABLE:
2317 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2318 break;
7560fffc 2319
0284adc6
LP
2320 case OBJECT_ENTRY_ARRAY:
2321 printf("Type: OBJECT_ENTRY_ARRAY\n");
2322 break;
7560fffc 2323
0284adc6 2324 case OBJECT_TAG:
507f22bd
ZJS
2325 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2326 le64toh(o->tag.seqnum),
2327 le64toh(o->tag.epoch));
0284adc6 2328 break;
3c1668da
LP
2329
2330 default:
2331 printf("Type: unknown (%u)\n", o->object.type);
2332 break;
0284adc6 2333 }
7560fffc 2334
0284adc6
LP
2335 if (o->object.flags & OBJECT_COMPRESSED)
2336 printf("Flags: COMPRESSED\n");
7560fffc 2337
0284adc6
LP
2338 if (p == le64toh(f->header->tail_object_offset))
2339 p = 0;
2340 else
2341 p = p + ALIGN64(le64toh(o->object.size));
2342 }
7560fffc 2343
0284adc6
LP
2344 return;
2345fail:
2346 log_error("File corrupt");
7560fffc
LP
2347}
2348
718fe4b1
ZJS
2349static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2350 const char *x;
2351
2352 x = format_timestamp(buf, l, t);
2353 if (x)
2354 return x;
2355 return " --- ";
2356}
2357
0284adc6 2358void journal_file_print_header(JournalFile *f) {
2765b7bb 2359 char a[33], b[33], c[33], d[33];
ed375beb 2360 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2361 struct stat st;
2362 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2363
2364 assert(f);
7560fffc 2365
0284adc6
LP
2366 printf("File Path: %s\n"
2367 "File ID: %s\n"
2368 "Machine ID: %s\n"
2369 "Boot ID: %s\n"
2370 "Sequential Number ID: %s\n"
2371 "State: %s\n"
2372 "Compatible Flags:%s%s\n"
2373 "Incompatible Flags:%s%s\n"
507f22bd
ZJS
2374 "Header size: %"PRIu64"\n"
2375 "Arena size: %"PRIu64"\n"
2376 "Data Hash Table Size: %"PRIu64"\n"
2377 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2378 "Rotate Suggested: %s\n"
507f22bd
ZJS
2379 "Head Sequential Number: %"PRIu64"\n"
2380 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2381 "Head Realtime Timestamp: %s\n"
3223f44f 2382 "Tail Realtime Timestamp: %s\n"
ed375beb 2383 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2384 "Objects: %"PRIu64"\n"
2385 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2386 f->path,
2387 sd_id128_to_string(f->header->file_id, a),
2388 sd_id128_to_string(f->header->machine_id, b),
2389 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2390 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2391 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2392 f->header->state == STATE_ONLINE ? "ONLINE" :
2393 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3
LP
2394 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
2395 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
2396 JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
2397 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
507f22bd
ZJS
2398 le64toh(f->header->header_size),
2399 le64toh(f->header->arena_size),
2400 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2401 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2402 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2403 le64toh(f->header->head_entry_seqnum),
2404 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2405 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2406 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2407 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2408 le64toh(f->header->n_objects),
2409 le64toh(f->header->n_entries));
7560fffc 2410
0284adc6 2411 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2412 printf("Data Objects: %"PRIu64"\n"
0284adc6 2413 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2414 le64toh(f->header->n_data),
0284adc6 2415 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2416
0284adc6 2417 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2418 printf("Field Objects: %"PRIu64"\n"
0284adc6 2419 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2420 le64toh(f->header->n_fields),
0284adc6 2421 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2422
2423 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2424 printf("Tag Objects: %"PRIu64"\n",
2425 le64toh(f->header->n_tags));
3223f44f 2426 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2427 printf("Entry Array Objects: %"PRIu64"\n",
2428 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2429
2430 if (fstat(f->fd, &st) >= 0)
2431 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
7560fffc
LP
2432}
2433
0284adc6
LP
2434int journal_file_open(
2435 const char *fname,
2436 int flags,
2437 mode_t mode,
2438 bool compress,
baed47c3 2439 bool seal,
0284adc6
LP
2440 JournalMetrics *metrics,
2441 MMapCache *mmap_cache,
2442 JournalFile *template,
2443 JournalFile **ret) {
7560fffc 2444
0284adc6
LP
2445 JournalFile *f;
2446 int r;
2447 bool newly_created = false;
7560fffc 2448
0284adc6 2449 assert(fname);
0559d3a5 2450 assert(ret);
7560fffc 2451
0284adc6
LP
2452 if ((flags & O_ACCMODE) != O_RDONLY &&
2453 (flags & O_ACCMODE) != O_RDWR)
2454 return -EINVAL;
7560fffc 2455
a0108012
LP
2456 if (!endswith(fname, ".journal") &&
2457 !endswith(fname, ".journal~"))
0284adc6 2458 return -EINVAL;
7560fffc 2459
0284adc6
LP
2460 f = new0(JournalFile, 1);
2461 if (!f)
2462 return -ENOMEM;
7560fffc 2463
0284adc6
LP
2464 f->fd = -1;
2465 f->mode = mode;
7560fffc 2466
0284adc6
LP
2467 f->flags = flags;
2468 f->prot = prot_from_flags(flags);
2469 f->writable = (flags & O_ACCMODE) != O_RDONLY;
48b61739 2470#ifdef HAVE_XZ
0284adc6 2471 f->compress = compress;
48b61739 2472#endif
49a32d43 2473#ifdef HAVE_GCRYPT
baed47c3 2474 f->seal = seal;
49a32d43 2475#endif
7560fffc 2476
0284adc6
LP
2477 if (mmap_cache)
2478 f->mmap = mmap_cache_ref(mmap_cache);
2479 else {
84168d80 2480 f->mmap = mmap_cache_new();
0284adc6
LP
2481 if (!f->mmap) {
2482 r = -ENOMEM;
2483 goto fail;
2484 }
2485 }
7560fffc 2486
0284adc6
LP
2487 f->path = strdup(fname);
2488 if (!f->path) {
2489 r = -ENOMEM;
2490 goto fail;
2491 }
7560fffc 2492
a4bcff5b
LP
2493 f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2494 if (!f->chain_cache) {
2495 r = -ENOMEM;
2496 goto fail;
2497 }
2498
0284adc6
LP
2499 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2500 if (f->fd < 0) {
2501 r = -errno;
2502 goto fail;
7560fffc 2503 }
7560fffc 2504
0284adc6
LP
2505 if (fstat(f->fd, &f->last_stat) < 0) {
2506 r = -errno;
2507 goto fail;
2508 }
7560fffc 2509
0284adc6 2510 if (f->last_stat.st_size == 0 && f->writable) {
fb0951b0
LP
2511 uint64_t crtime;
2512
2513 /* Let's attach the creation time to the journal file,
2514 * so that the vacuuming code knows the age of this
2515 * file even if the file might end up corrupted one
2516 * day... Ideally we'd just use the creation time many
2517 * file systems maintain for each file, but there is
2518 * currently no usable API to query this, hence let's
2519 * emulate this via extended attributes. If extended
2520 * attributes are not supported we'll just skip this,
2521 * and rely solely on mtime/atime/ctime of the file.*/
2522
2523 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2524 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
7560fffc 2525
feb12d3e 2526#ifdef HAVE_GCRYPT
0284adc6 2527 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2528 * just don't do sealing */
49a32d43
LP
2529 if (f->seal) {
2530 r = journal_file_fss_load(f);
2531 if (r < 0)
2532 f->seal = false;
2533 }
feb12d3e 2534#endif
7560fffc 2535
0284adc6
LP
2536 r = journal_file_init_header(f, template);
2537 if (r < 0)
2538 goto fail;
7560fffc 2539
0284adc6
LP
2540 if (fstat(f->fd, &f->last_stat) < 0) {
2541 r = -errno;
2542 goto fail;
2543 }
fb0951b0
LP
2544
2545 newly_created = true;
0284adc6 2546 }
7560fffc 2547
0284adc6
LP
2548 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2549 r = -EIO;
2550 goto fail;
2551 }
7560fffc 2552
0284adc6
LP
2553 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2554 if (f->header == MAP_FAILED) {
2555 f->header = NULL;
2556 r = -errno;
2557 goto fail;
2558 }
7560fffc 2559
0284adc6
LP
2560 if (!newly_created) {
2561 r = journal_file_verify_header(f);
2562 if (r < 0)
2563 goto fail;
2564 }
7560fffc 2565
feb12d3e 2566#ifdef HAVE_GCRYPT
0284adc6 2567 if (!newly_created && f->writable) {
baed47c3 2568 r = journal_file_fss_load(f);
0284adc6
LP
2569 if (r < 0)
2570 goto fail;
2571 }
feb12d3e 2572#endif
cec736d2
LP
2573
2574 if (f->writable) {
4a92baf3
LP
2575 if (metrics) {
2576 journal_default_metrics(metrics, f->fd);
2577 f->metrics = *metrics;
2578 } else if (template)
2579 f->metrics = template->metrics;
2580
cec736d2
LP
2581 r = journal_file_refresh_header(f);
2582 if (r < 0)
2583 goto fail;
2584 }
2585
feb12d3e 2586#ifdef HAVE_GCRYPT
baed47c3 2587 r = journal_file_hmac_setup(f);
14d10188
LP
2588 if (r < 0)
2589 goto fail;
feb12d3e 2590#endif
14d10188 2591
cec736d2 2592 if (newly_created) {
de190aef 2593 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2594 if (r < 0)
2595 goto fail;
2596
de190aef 2597 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2598 if (r < 0)
2599 goto fail;
7560fffc 2600
feb12d3e 2601#ifdef HAVE_GCRYPT
7560fffc
LP
2602 r = journal_file_append_first_tag(f);
2603 if (r < 0)
2604 goto fail;
feb12d3e 2605#endif
cec736d2
LP
2606 }
2607
de190aef 2608 r = journal_file_map_field_hash_table(f);
cec736d2
LP
2609 if (r < 0)
2610 goto fail;
2611
de190aef 2612 r = journal_file_map_data_hash_table(f);
cec736d2
LP
2613 if (r < 0)
2614 goto fail;
2615
0559d3a5 2616 *ret = f;
cec736d2
LP
2617 return 0;
2618
2619fail:
2620 journal_file_close(f);
2621
2622 return r;
2623}
0ac38b70 2624
baed47c3 2625int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2626 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2627 size_t l;
2628 JournalFile *old_file, *new_file = NULL;
2629 int r;
2630
2631 assert(f);
2632 assert(*f);
2633
2634 old_file = *f;
2635
2636 if (!old_file->writable)
2637 return -EINVAL;
2638
2639 if (!endswith(old_file->path, ".journal"))
2640 return -EINVAL;
2641
2642 l = strlen(old_file->path);
57535f47
ZJS
2643 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2644 (int) l - 8, old_file->path,
2645 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2646 le64toh((*f)->header->head_entry_seqnum),
2647 le64toh((*f)->header->head_entry_realtime));
2648 if (r < 0)
0ac38b70
LP
2649 return -ENOMEM;
2650
0ac38b70 2651 r = rename(old_file->path, p);
0ac38b70
LP
2652 if (r < 0)
2653 return -errno;
2654
ccdbaf91 2655 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2656
baed47c3 2657 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2658 journal_file_close(old_file);
2659
2660 *f = new_file;
2661 return r;
2662}
2663
9447a7f1
LP
2664int journal_file_open_reliably(
2665 const char *fname,
2666 int flags,
2667 mode_t mode,
7560fffc 2668 bool compress,
baed47c3 2669 bool seal,
4a92baf3 2670 JournalMetrics *metrics,
27370278 2671 MMapCache *mmap_cache,
9447a7f1
LP
2672 JournalFile *template,
2673 JournalFile **ret) {
2674
2675 int r;
2676 size_t l;
ed375beb 2677 _cleanup_free_ char *p = NULL;
9447a7f1 2678
baed47c3 2679 r = journal_file_open(fname, flags, mode, compress, seal,
27370278 2680 metrics, mmap_cache, template, ret);
0071d9f1
LP
2681 if (r != -EBADMSG && /* corrupted */
2682 r != -ENODATA && /* truncated */
2683 r != -EHOSTDOWN && /* other machine */
a1a1898f
LP
2684 r != -EPROTONOSUPPORT && /* incompatible feature */
2685 r != -EBUSY && /* unclean shutdown */
2686 r != -ESHUTDOWN /* already archived */)
9447a7f1
LP
2687 return r;
2688
2689 if ((flags & O_ACCMODE) == O_RDONLY)
2690 return r;
2691
2692 if (!(flags & O_CREAT))
2693 return r;
2694
7560fffc
LP
2695 if (!endswith(fname, ".journal"))
2696 return r;
2697
5c70eab4
LP
2698 /* The file is corrupted. Rotate it away and try it again (but only once) */
2699
9447a7f1 2700 l = strlen(fname);
9bf3b535 2701 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
57535f47 2702 (int) l - 8, fname,
9447a7f1 2703 (unsigned long long) now(CLOCK_REALTIME),
9bf3b535 2704 random_u64()) < 0)
9447a7f1
LP
2705 return -ENOMEM;
2706
2707 r = rename(fname, p);
9447a7f1
LP
2708 if (r < 0)
2709 return -errno;
2710
a1a1898f 2711 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2712
baed47c3 2713 return journal_file_open(fname, flags, mode, compress, seal,
27370278 2714 metrics, mmap_cache, template, ret);
9447a7f1
LP
2715}
2716
cf244689
LP
2717int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2718 uint64_t i, n;
2719 uint64_t q, xor_hash = 0;
2720 int r;
2721 EntryItem *items;
2722 dual_timestamp ts;
2723
2724 assert(from);
2725 assert(to);
2726 assert(o);
2727 assert(p);
2728
2729 if (!to->writable)
2730 return -EPERM;
2731
2732 ts.monotonic = le64toh(o->entry.monotonic);
2733 ts.realtime = le64toh(o->entry.realtime);
2734
cf244689 2735 n = journal_file_entry_n_items(o);
4faa7004
TA
2736 /* alloca() can't take 0, hence let's allocate at least one */
2737 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
2738
2739 for (i = 0; i < n; i++) {
4fd052ae
FC
2740 uint64_t l, h;
2741 le64_t le_hash;
cf244689
LP
2742 size_t t;
2743 void *data;
2744 Object *u;
2745
2746 q = le64toh(o->entry.items[i].object_offset);
2747 le_hash = o->entry.items[i].hash;
2748
2749 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2750 if (r < 0)
2751 return r;
2752
2753 if (le_hash != o->data.hash)
2754 return -EBADMSG;
2755
2756 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2757 t = (size_t) l;
2758
2759 /* We hit the limit on 32bit machines */
2760 if ((uint64_t) t != l)
2761 return -E2BIG;
2762
2763 if (o->object.flags & OBJECT_COMPRESSED) {
2764#ifdef HAVE_XZ
2765 uint64_t rsize;
2766
93b73b06 2767 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0))
cf244689
LP
2768 return -EBADMSG;
2769
2770 data = from->compress_buffer;
2771 l = rsize;
2772#else
2773 return -EPROTONOSUPPORT;
2774#endif
2775 } else
2776 data = o->data.payload;
2777
2778 r = journal_file_append_data(to, data, l, &u, &h);
2779 if (r < 0)
2780 return r;
2781
2782 xor_hash ^= le64toh(u->data.hash);
2783 items[i].object_offset = htole64(h);
2784 items[i].hash = u->data.hash;
2785
2786 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2787 if (r < 0)
2788 return r;
2789 }
2790
2791 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2792}
babfc091
LP
2793
2794void journal_default_metrics(JournalMetrics *m, int fd) {
2795 uint64_t fs_size = 0;
2796 struct statvfs ss;
a7bc2c2a 2797 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
babfc091
LP
2798
2799 assert(m);
2800 assert(fd >= 0);
2801
2802 if (fstatvfs(fd, &ss) >= 0)
2803 fs_size = ss.f_frsize * ss.f_blocks;
2804
2805 if (m->max_use == (uint64_t) -1) {
2806
2807 if (fs_size > 0) {
2808 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2809
2810 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2811 m->max_use = DEFAULT_MAX_USE_UPPER;
2812
2813 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2814 m->max_use = DEFAULT_MAX_USE_LOWER;
2815 } else
2816 m->max_use = DEFAULT_MAX_USE_LOWER;
2817 } else {
2818 m->max_use = PAGE_ALIGN(m->max_use);
2819
2820 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2821 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2822 }
2823
2824 if (m->max_size == (uint64_t) -1) {
2825 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2826
2827 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2828 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2829 } else
2830 m->max_size = PAGE_ALIGN(m->max_size);
2831
2832 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2833 m->max_size = JOURNAL_FILE_SIZE_MIN;
2834
2835 if (m->max_size*2 > m->max_use)
2836 m->max_use = m->max_size*2;
2837
2838 if (m->min_size == (uint64_t) -1)
2839 m->min_size = JOURNAL_FILE_SIZE_MIN;
2840 else {
2841 m->min_size = PAGE_ALIGN(m->min_size);
2842
2843 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2844 m->min_size = JOURNAL_FILE_SIZE_MIN;
2845
2846 if (m->min_size > m->max_size)
2847 m->max_size = m->min_size;
2848 }
2849
2850 if (m->keep_free == (uint64_t) -1) {
2851
2852 if (fs_size > 0) {
8621b110 2853 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
2854
2855 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2856 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2857
2858 } else
2859 m->keep_free = DEFAULT_KEEP_FREE;
2860 }
2861
2b43f939
LP
2862 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2863 format_bytes(a, sizeof(a), m->max_use),
2864 format_bytes(b, sizeof(b), m->max_size),
2865 format_bytes(c, sizeof(c), m->min_size),
2866 format_bytes(d, sizeof(d), m->keep_free));
babfc091 2867}
08984293
LP
2868
2869int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
2870 assert(f);
2871 assert(from || to);
2872
2873 if (from) {
162566a4
LP
2874 if (f->header->head_entry_realtime == 0)
2875 return -ENOENT;
08984293 2876
162566a4 2877 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
2878 }
2879
2880 if (to) {
162566a4
LP
2881 if (f->header->tail_entry_realtime == 0)
2882 return -ENOENT;
08984293 2883
162566a4 2884 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
2885 }
2886
2887 return 1;
2888}
2889
2890int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
2891 Object *o;
2892 uint64_t p;
2893 int r;
2894
2895 assert(f);
2896 assert(from || to);
2897
47838ab3 2898 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
2899 if (r <= 0)
2900 return r;
2901
2902 if (le64toh(o->data.n_entries) <= 0)
2903 return 0;
2904
2905 if (from) {
2906 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2907 if (r < 0)
2908 return r;
2909
2910 *from = le64toh(o->entry.monotonic);
2911 }
2912
2913 if (to) {
2914 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2915 if (r < 0)
2916 return r;
2917
2918 r = generic_array_get_plus_one(f,
2919 le64toh(o->data.entry_offset),
2920 le64toh(o->data.entry_array_offset),
2921 le64toh(o->data.n_entries)-1,
2922 &o, NULL);
2923 if (r <= 0)
2924 return r;
2925
2926 *to = le64toh(o->entry.monotonic);
2927 }
2928
2929 return 1;
2930}
dca6219e 2931
fb0951b0 2932bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
2933 assert(f);
2934
2935 /* If we gained new header fields we gained new features,
2936 * hence suggest a rotation */
361f9cbc
LP
2937 if (le64toh(f->header->header_size) < sizeof(Header)) {
2938 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 2939 return true;
361f9cbc 2940 }
dca6219e
LP
2941
2942 /* Let's check if the hash tables grew over a certain fill
2943 * level (75%, borrowing this value from Java's hash table
2944 * implementation), and if so suggest a rotation. To calculate
2945 * the fill level we need the n_data field, which only exists
2946 * in newer versions. */
2947
2948 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 2949 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 2950 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
2951 f->path,
2952 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
2953 le64toh(f->header->n_data),
2954 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2955 (unsigned long long) f->last_stat.st_size,
2956 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 2957 return true;
361f9cbc 2958 }
dca6219e
LP
2959
2960 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 2961 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 2962 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
2963 f->path,
2964 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
2965 le64toh(f->header->n_fields),
2966 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 2967 return true;
361f9cbc 2968 }
dca6219e 2969
0598fd4a
LP
2970 /* Are the data objects properly indexed by field objects? */
2971 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2972 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2973 le64toh(f->header->n_data) > 0 &&
2974 le64toh(f->header->n_fields) == 0)
2975 return true;
2976
fb0951b0
LP
2977 if (max_file_usec > 0) {
2978 usec_t t, h;
2979
2980 h = le64toh(f->header->head_entry_realtime);
2981 t = now(CLOCK_REALTIME);
2982
2983 if (h > 0 && t > h + max_file_usec)
2984 return true;
2985 }
2986
dca6219e
LP
2987 return false;
2988}