]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
journal: move definition of LocationType to journal-file.h
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <sys/mman.h>
23#include <errno.h>
24#include <sys/uio.h>
25#include <unistd.h>
26#include <sys/statvfs.h>
27#include <fcntl.h>
28#include <stddef.h>
d2edfae0 29#include <sys/xattr.h>
fb0951b0 30
cec736d2
LP
31#include "journal-def.h"
32#include "journal-file.h"
0284adc6 33#include "journal-authenticate.h"
cec736d2 34#include "lookup3.h"
807e17f0 35#include "compress.h"
7560fffc 36#include "fsprg.h"
cec736d2 37
4a92baf3
LP
38#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
39#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 40
be19b7df 41#define COMPRESSION_SIZE_THRESHOLD (512ULL)
807e17f0 42
babfc091 43/* This is the minimum journal file size */
253f59df 44#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
babfc091
LP
45
46/* These are the lower and upper bounds if we deduce the max_use value
47 * from the file system size */
48#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
49#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
50
51/* This is the upper bound if we deduce max_size from max_use */
71100051 52#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
babfc091
LP
53
54/* This is the upper bound if we deduce the keep_free value from the
55 * file system size */
56#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
57
58/* This is the keep_free value when we can't determine the system
59 * size */
60#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
61
dca6219e
LP
62/* n_data was the first entry we added after the initial file format design */
63#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 64
a4bcff5b
LP
65/* How many entries to keep in the entry array chain cache at max */
66#define CHAIN_CACHE_MAX 20
67
a676e665
LP
68/* How much to increase the journal file size at once each time we allocate something new. */
69#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
70
9588bc32 71static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
72 assert(f);
73
74 if (!f->writable)
75 return -EPERM;
76
77 if (!(f->fd >= 0 && f->header))
78 return -EINVAL;
79
80 switch(f->header->state) {
81 case STATE_ONLINE:
82 return 0;
83
84 case STATE_OFFLINE:
85 f->header->state = STATE_ONLINE;
86 fsync(f->fd);
87 return 0;
88
89 default:
90 return -EINVAL;
91 }
92}
93
94int journal_file_set_offline(JournalFile *f) {
95 assert(f);
96
97 if (!f->writable)
98 return -EPERM;
99
100 if (!(f->fd >= 0 && f->header))
101 return -EINVAL;
102
103 if (f->header->state != STATE_ONLINE)
104 return 0;
105
106 fsync(f->fd);
107
108 f->header->state = STATE_OFFLINE;
109
110 fsync(f->fd);
111
112 return 0;
113}
114
cec736d2 115void journal_file_close(JournalFile *f) {
de190aef 116 assert(f);
cec736d2 117
feb12d3e 118#ifdef HAVE_GCRYPT
b0af6f41 119 /* Write the final tag */
c586dbf1 120 if (f->seal && f->writable)
b0af6f41 121 journal_file_append_tag(f);
feb12d3e 122#endif
b0af6f41 123
7560fffc 124 /* Sync everything to disk, before we mark the file offline */
16e9f408
LP
125 if (f->mmap && f->fd >= 0)
126 mmap_cache_close_fd(f->mmap, f->fd);
7560fffc 127
26687bf8 128 journal_file_set_offline(f);
cec736d2 129
26687bf8 130 if (f->header)
d384c7a8 131 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
cec736d2 132
03e334a1 133 safe_close(f->fd);
cec736d2 134 free(f->path);
807e17f0 135
16e9f408
LP
136 if (f->mmap)
137 mmap_cache_unref(f->mmap);
138
4743015d 139 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 140
d89c8fdf 141#if defined(HAVE_XZ) || defined(HAVE_LZ4)
807e17f0
LP
142 free(f->compress_buffer);
143#endif
144
7560fffc 145#ifdef HAVE_GCRYPT
baed47c3
LP
146 if (f->fss_file)
147 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
b7c9ae91
LP
148 else if (f->fsprg_state)
149 free(f->fsprg_state);
150
151 free(f->fsprg_seed);
7560fffc
LP
152
153 if (f->hmac)
154 gcry_md_close(f->hmac);
155#endif
156
cec736d2
LP
157 free(f);
158}
159
0ac38b70 160static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 161 Header h = {};
cec736d2
LP
162 ssize_t k;
163 int r;
164
165 assert(f);
166
7560fffc 167 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 168 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 169
d89c8fdf
ZJS
170 h.incompatible_flags |= htole32(
171 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
172 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 173
d89c8fdf
ZJS
174 h.compatible_flags = htole32(
175 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 176
cec736d2
LP
177 r = sd_id128_randomize(&h.file_id);
178 if (r < 0)
179 return r;
180
0ac38b70
LP
181 if (template) {
182 h.seqnum_id = template->header->seqnum_id;
beec0085 183 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
184 } else
185 h.seqnum_id = h.file_id;
cec736d2
LP
186
187 k = pwrite(f->fd, &h, sizeof(h), 0);
188 if (k < 0)
189 return -errno;
190
191 if (k != sizeof(h))
192 return -EIO;
193
194 return 0;
195}
196
197static int journal_file_refresh_header(JournalFile *f) {
198 int r;
de190aef 199 sd_id128_t boot_id;
cec736d2
LP
200
201 assert(f);
202
203 r = sd_id128_get_machine(&f->header->machine_id);
204 if (r < 0)
205 return r;
206
de190aef 207 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
208 if (r < 0)
209 return r;
210
de190aef
LP
211 if (sd_id128_equal(boot_id, f->header->boot_id))
212 f->tail_entry_monotonic_valid = true;
213
214 f->header->boot_id = boot_id;
215
26687bf8 216 journal_file_set_online(f);
b788cc23 217
7560fffc 218 /* Sync the online state to disk */
a676e665 219 fsync(f->fd);
b788cc23 220
cec736d2
LP
221 return 0;
222}
223
224static int journal_file_verify_header(JournalFile *f) {
d89c8fdf
ZJS
225 uint32_t flags;
226
cec736d2
LP
227 assert(f);
228
7560fffc 229 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
230 return -EBADMSG;
231
7560fffc
LP
232 /* In both read and write mode we refuse to open files with
233 * incompatible flags we don't know */
d89c8fdf
ZJS
234 flags = le32toh(f->header->incompatible_flags);
235 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
236 if (flags & ~HEADER_INCOMPATIBLE_ANY)
237 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
238 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
239 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
240 if (flags)
241 log_debug("Journal file %s uses incompatible flags %"PRIx32
242 " disabled at compilation time.", f->path, flags);
cec736d2 243 return -EPROTONOSUPPORT;
d89c8fdf 244 }
cec736d2 245
7560fffc
LP
246 /* When open for writing we refuse to open files with
247 * compatible flags, too */
d89c8fdf
ZJS
248 flags = le32toh(f->header->compatible_flags);
249 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
250 if (flags & ~HEADER_COMPATIBLE_ANY)
251 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
252 f->path, flags & ~HEADER_COMPATIBLE_ANY);
253 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
254 if (flags)
255 log_debug("Journal file %s uses compatible flags %"PRIx32
256 " disabled at compilation time.", f->path, flags);
257 return -EPROTONOSUPPORT;
7560fffc
LP
258 }
259
db11ac1a
LP
260 if (f->header->state >= _STATE_MAX)
261 return -EBADMSG;
262
dca6219e
LP
263 /* The first addition was n_data, so check that we are at least this large */
264 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
265 return -EBADMSG;
266
8088cbd3 267 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
268 return -EBADMSG;
269
db11ac1a
LP
270 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
271 return -ENODATA;
272
273 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
274 return -ENODATA;
275
7762e02b
LP
276 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
277 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
278 !VALID64(le64toh(f->header->tail_object_offset)) ||
279 !VALID64(le64toh(f->header->entry_array_offset)))
280 return -ENODATA;
281
cec736d2 282 if (f->writable) {
ccdbaf91 283 uint8_t state;
cec736d2
LP
284 sd_id128_t machine_id;
285 int r;
286
287 r = sd_id128_get_machine(&machine_id);
288 if (r < 0)
289 return r;
290
291 if (!sd_id128_equal(machine_id, f->header->machine_id))
292 return -EHOSTDOWN;
293
de190aef 294 state = f->header->state;
cec736d2 295
71fa6f00
LP
296 if (state == STATE_ONLINE) {
297 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
298 return -EBUSY;
299 } else if (state == STATE_ARCHIVED)
cec736d2 300 return -ESHUTDOWN;
71fa6f00
LP
301 else if (state != STATE_OFFLINE) {
302 log_debug("Journal file %s has unknown state %u.", f->path, state);
303 return -EBUSY;
304 }
cec736d2
LP
305 }
306
d89c8fdf
ZJS
307 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
308 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 309
f1889c91 310 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 311
cec736d2
LP
312 return 0;
313}
314
315static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 316 uint64_t old_size, new_size;
fec2aa2f 317 int r;
cec736d2
LP
318
319 assert(f);
320
cec736d2 321 /* We assume that this file is not sparse, and we know that
38ac38b2 322 * for sure, since we always call posix_fallocate()
cec736d2
LP
323 * ourselves */
324
325 old_size =
23b0b2b2 326 le64toh(f->header->header_size) +
cec736d2
LP
327 le64toh(f->header->arena_size);
328
bc85bfee 329 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
330 if (new_size < le64toh(f->header->header_size))
331 new_size = le64toh(f->header->header_size);
bc85bfee
LP
332
333 if (new_size <= old_size)
cec736d2
LP
334 return 0;
335
a676e665 336 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 337 return -E2BIG;
cec736d2 338
a676e665 339 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
340 struct statvfs svfs;
341
342 if (fstatvfs(f->fd, &svfs) >= 0) {
343 uint64_t available;
344
345 available = svfs.f_bfree * svfs.f_bsize;
346
bc85bfee
LP
347 if (available >= f->metrics.keep_free)
348 available -= f->metrics.keep_free;
cec736d2
LP
349 else
350 available = 0;
351
352 if (new_size - old_size > available)
353 return -E2BIG;
354 }
355 }
356
eda4b58b
LP
357 /* Increase by larger blocks at once */
358 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
359 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
360 new_size = f->metrics.max_size;
361
bc85bfee
LP
362 /* Note that the glibc fallocate() fallback is very
363 inefficient, hence we try to minimize the allocation area
364 as we can. */
fec2aa2f
GV
365 r = posix_fallocate(f->fd, old_size, new_size - old_size);
366 if (r != 0)
367 return -r;
cec736d2 368
eda4b58b
LP
369 if (fstat(f->fd, &f->last_stat) < 0)
370 return -errno;
cec736d2 371
23b0b2b2 372 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2
LP
373
374 return 0;
375}
376
78519831 377static unsigned type_to_context(ObjectType type) {
d3d3208f 378 /* One context for each type, plus one catch-all for the rest */
69adae51 379 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
d05089d8 380 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
381}
382
7a9dabea 383static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
cec736d2 384 assert(f);
cec736d2
LP
385 assert(ret);
386
7762e02b
LP
387 if (size <= 0)
388 return -EINVAL;
389
2a59ea54 390 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
391 if (offset + size > (uint64_t) f->last_stat.st_size) {
392 /* Hmm, out of range? Let's refresh the fstat() data
393 * first, before we trust that check. */
394
395 if (fstat(f->fd, &f->last_stat) < 0 ||
396 offset + size > (uint64_t) f->last_stat.st_size)
397 return -EADDRNOTAVAIL;
398 }
399
7a9dabea 400 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
401}
402
16e9f408
LP
403static uint64_t minimum_header_size(Object *o) {
404
b8e891e6 405 static const uint64_t table[] = {
16e9f408
LP
406 [OBJECT_DATA] = sizeof(DataObject),
407 [OBJECT_FIELD] = sizeof(FieldObject),
408 [OBJECT_ENTRY] = sizeof(EntryObject),
409 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
410 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
411 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
412 [OBJECT_TAG] = sizeof(TagObject),
413 };
414
415 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
416 return sizeof(ObjectHeader);
417
418 return table[o->object.type];
419}
420
78519831 421int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
422 int r;
423 void *t;
424 Object *o;
425 uint64_t s;
426
427 assert(f);
428 assert(ret);
429
db11ac1a
LP
430 /* Objects may only be located at multiple of 64 bit */
431 if (!VALID64(offset))
432 return -EFAULT;
433
7a9dabea 434 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
435 if (r < 0)
436 return r;
437
438 o = (Object*) t;
439 s = le64toh(o->object.size);
440
441 if (s < sizeof(ObjectHeader))
442 return -EBADMSG;
443
16e9f408
LP
444 if (o->object.type <= OBJECT_UNUSED)
445 return -EBADMSG;
446
447 if (s < minimum_header_size(o))
448 return -EBADMSG;
449
d05089d8 450 if (type > OBJECT_UNUSED && o->object.type != type)
cec736d2
LP
451 return -EBADMSG;
452
453 if (s > sizeof(ObjectHeader)) {
7a9dabea 454 r = journal_file_move_to(f, type, false, offset, s, &t);
cec736d2
LP
455 if (r < 0)
456 return r;
457
458 o = (Object*) t;
459 }
460
cec736d2
LP
461 *ret = o;
462 return 0;
463}
464
d98cc1f2 465static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
466 uint64_t r;
467
468 assert(f);
469
beec0085 470 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
471
472 if (seqnum) {
de190aef 473 /* If an external seqnum counter was passed, we update
c2373f84
LP
474 * both the local and the external one, and set it to
475 * the maximum of both */
476
477 if (*seqnum + 1 > r)
478 r = *seqnum + 1;
479
480 *seqnum = r;
481 }
482
beec0085 483 f->header->tail_entry_seqnum = htole64(r);
cec736d2 484
beec0085
LP
485 if (f->header->head_entry_seqnum == 0)
486 f->header->head_entry_seqnum = htole64(r);
de190aef 487
cec736d2
LP
488 return r;
489}
490
78519831 491int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
492 int r;
493 uint64_t p;
494 Object *tail, *o;
495 void *t;
496
497 assert(f);
d05089d8 498 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
499 assert(size >= sizeof(ObjectHeader));
500 assert(offset);
501 assert(ret);
502
26687bf8
OS
503 r = journal_file_set_online(f);
504 if (r < 0)
505 return r;
506
cec736d2 507 p = le64toh(f->header->tail_object_offset);
cec736d2 508 if (p == 0)
23b0b2b2 509 p = le64toh(f->header->header_size);
cec736d2 510 else {
d05089d8 511 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
512 if (r < 0)
513 return r;
514
515 p += ALIGN64(le64toh(tail->object.size));
516 }
517
518 r = journal_file_allocate(f, p, size);
519 if (r < 0)
520 return r;
521
fcde2389 522 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
523 if (r < 0)
524 return r;
525
526 o = (Object*) t;
527
528 zero(o->object);
de190aef 529 o->object.type = type;
cec736d2
LP
530 o->object.size = htole64(size);
531
532 f->header->tail_object_offset = htole64(p);
cec736d2
LP
533 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
534
535 *ret = o;
536 *offset = p;
537
538 return 0;
539}
540
de190aef 541static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
542 uint64_t s, p;
543 Object *o;
544 int r;
545
546 assert(f);
547
dfabe643 548 /* We estimate that we need 1 hash table entry per 768 of
4a92baf3
LP
549 journal file and we want to make sure we never get beyond
550 75% fill level. Calculate the hash table size for the
551 maximum file size based on these metrics. */
552
dfabe643 553 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
554 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
555 s = DEFAULT_DATA_HASH_TABLE_SIZE;
556
507f22bd 557 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 558
de190aef
LP
559 r = journal_file_append_object(f,
560 OBJECT_DATA_HASH_TABLE,
561 offsetof(Object, hash_table.items) + s,
562 &o, &p);
cec736d2
LP
563 if (r < 0)
564 return r;
565
29804cc1 566 memzero(o->hash_table.items, s);
cec736d2 567
de190aef
LP
568 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
569 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
570
571 return 0;
572}
573
de190aef 574static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
575 uint64_t s, p;
576 Object *o;
577 int r;
578
579 assert(f);
580
3c1668da
LP
581 /* We use a fixed size hash table for the fields as this
582 * number should grow very slowly only */
583
de190aef
LP
584 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
585 r = journal_file_append_object(f,
586 OBJECT_FIELD_HASH_TABLE,
587 offsetof(Object, hash_table.items) + s,
588 &o, &p);
cec736d2
LP
589 if (r < 0)
590 return r;
591
29804cc1 592 memzero(o->hash_table.items, s);
cec736d2 593
de190aef
LP
594 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
595 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
596
597 return 0;
598}
599
de190aef 600static int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
601 uint64_t s, p;
602 void *t;
603 int r;
604
605 assert(f);
606
de190aef
LP
607 p = le64toh(f->header->data_hash_table_offset);
608 s = le64toh(f->header->data_hash_table_size);
cec736d2 609
de190aef 610 r = journal_file_move_to(f,
16e9f408 611 OBJECT_DATA_HASH_TABLE,
fcde2389 612 true,
de190aef
LP
613 p, s,
614 &t);
cec736d2
LP
615 if (r < 0)
616 return r;
617
de190aef 618 f->data_hash_table = t;
cec736d2
LP
619 return 0;
620}
621
de190aef 622static int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
623 uint64_t s, p;
624 void *t;
625 int r;
626
627 assert(f);
628
de190aef
LP
629 p = le64toh(f->header->field_hash_table_offset);
630 s = le64toh(f->header->field_hash_table_size);
cec736d2 631
de190aef 632 r = journal_file_move_to(f,
16e9f408 633 OBJECT_FIELD_HASH_TABLE,
fcde2389 634 true,
de190aef
LP
635 p, s,
636 &t);
cec736d2
LP
637 if (r < 0)
638 return r;
639
de190aef 640 f->field_hash_table = t;
cec736d2
LP
641 return 0;
642}
643
3c1668da
LP
644static int journal_file_link_field(
645 JournalFile *f,
646 Object *o,
647 uint64_t offset,
648 uint64_t hash) {
649
650 uint64_t p, h;
651 int r;
652
653 assert(f);
654 assert(o);
655 assert(offset > 0);
656
657 if (o->object.type != OBJECT_FIELD)
658 return -EINVAL;
659
660 /* This might alter the window we are looking at */
661
662 o->field.next_hash_offset = o->field.head_data_offset = 0;
663
664 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
665 p = le64toh(f->field_hash_table[h].tail_hash_offset);
666 if (p == 0)
667 f->field_hash_table[h].head_hash_offset = htole64(offset);
668 else {
669 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
670 if (r < 0)
671 return r;
672
673 o->field.next_hash_offset = htole64(offset);
674 }
675
676 f->field_hash_table[h].tail_hash_offset = htole64(offset);
677
678 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
679 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
680
681 return 0;
682}
683
684static int journal_file_link_data(
685 JournalFile *f,
686 Object *o,
687 uint64_t offset,
688 uint64_t hash) {
689
de190aef 690 uint64_t p, h;
cec736d2
LP
691 int r;
692
693 assert(f);
694 assert(o);
695 assert(offset > 0);
b588975f
LP
696
697 if (o->object.type != OBJECT_DATA)
698 return -EINVAL;
cec736d2 699
48496df6
LP
700 /* This might alter the window we are looking at */
701
de190aef
LP
702 o->data.next_hash_offset = o->data.next_field_offset = 0;
703 o->data.entry_offset = o->data.entry_array_offset = 0;
704 o->data.n_entries = 0;
cec736d2 705
de190aef 706 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
8db4213e 707 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 708 if (p == 0)
cec736d2 709 /* Only entry in the hash table is easy */
de190aef 710 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 711 else {
48496df6
LP
712 /* Move back to the previous data object, to patch in
713 * pointer */
cec736d2 714
de190aef 715 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
716 if (r < 0)
717 return r;
718
de190aef 719 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
720 }
721
de190aef 722 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 723
dca6219e
LP
724 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
725 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
726
cec736d2
LP
727 return 0;
728}
729
3c1668da
LP
730int journal_file_find_field_object_with_hash(
731 JournalFile *f,
732 const void *field, uint64_t size, uint64_t hash,
733 Object **ret, uint64_t *offset) {
734
735 uint64_t p, osize, h;
736 int r;
737
738 assert(f);
739 assert(field && size > 0);
740
741 osize = offsetof(Object, field.payload) + size;
742
743 if (f->header->field_hash_table_size == 0)
744 return -EBADMSG;
745
746 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
747 p = le64toh(f->field_hash_table[h].head_hash_offset);
748
749 while (p > 0) {
750 Object *o;
751
752 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
753 if (r < 0)
754 return r;
755
756 if (le64toh(o->field.hash) == hash &&
757 le64toh(o->object.size) == osize &&
758 memcmp(o->field.payload, field, size) == 0) {
759
760 if (ret)
761 *ret = o;
762 if (offset)
763 *offset = p;
764
765 return 1;
766 }
767
768 p = le64toh(o->field.next_hash_offset);
769 }
770
771 return 0;
772}
773
774int journal_file_find_field_object(
775 JournalFile *f,
776 const void *field, uint64_t size,
777 Object **ret, uint64_t *offset) {
778
779 uint64_t hash;
780
781 assert(f);
782 assert(field && size > 0);
783
784 hash = hash64(field, size);
785
786 return journal_file_find_field_object_with_hash(f,
787 field, size, hash,
788 ret, offset);
789}
790
de190aef
LP
791int journal_file_find_data_object_with_hash(
792 JournalFile *f,
793 const void *data, uint64_t size, uint64_t hash,
794 Object **ret, uint64_t *offset) {
48496df6 795
de190aef 796 uint64_t p, osize, h;
cec736d2
LP
797 int r;
798
799 assert(f);
800 assert(data || size == 0);
801
802 osize = offsetof(Object, data.payload) + size;
803
bc85bfee
LP
804 if (f->header->data_hash_table_size == 0)
805 return -EBADMSG;
806
de190aef
LP
807 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
808 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 809
de190aef
LP
810 while (p > 0) {
811 Object *o;
cec736d2 812
de190aef 813 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
814 if (r < 0)
815 return r;
816
807e17f0 817 if (le64toh(o->data.hash) != hash)
85a131e8 818 goto next;
807e17f0 819
d89c8fdf 820 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 821#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51
ZJS
822 uint64_t l;
823 size_t rsize;
cec736d2 824
807e17f0
LP
825 l = le64toh(o->object.size);
826 if (l <= offsetof(Object, data.payload))
cec736d2
LP
827 return -EBADMSG;
828
807e17f0
LP
829 l -= offsetof(Object, data.payload);
830
d89c8fdf
ZJS
831 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
832 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
833 if (r < 0)
834 return r;
807e17f0 835
b785c858 836 if (rsize == size &&
807e17f0
LP
837 memcmp(f->compress_buffer, data, size) == 0) {
838
839 if (ret)
840 *ret = o;
841
842 if (offset)
843 *offset = p;
844
845 return 1;
846 }
3b1a55e1
ZJS
847#else
848 return -EPROTONOSUPPORT;
849#endif
807e17f0
LP
850 } else if (le64toh(o->object.size) == osize &&
851 memcmp(o->data.payload, data, size) == 0) {
852
cec736d2
LP
853 if (ret)
854 *ret = o;
855
856 if (offset)
857 *offset = p;
858
de190aef 859 return 1;
cec736d2
LP
860 }
861
85a131e8 862 next:
cec736d2
LP
863 p = le64toh(o->data.next_hash_offset);
864 }
865
de190aef
LP
866 return 0;
867}
868
869int journal_file_find_data_object(
870 JournalFile *f,
871 const void *data, uint64_t size,
872 Object **ret, uint64_t *offset) {
873
874 uint64_t hash;
875
876 assert(f);
877 assert(data || size == 0);
878
879 hash = hash64(data, size);
880
881 return journal_file_find_data_object_with_hash(f,
882 data, size, hash,
883 ret, offset);
884}
885
3c1668da
LP
886static int journal_file_append_field(
887 JournalFile *f,
888 const void *field, uint64_t size,
889 Object **ret, uint64_t *offset) {
890
891 uint64_t hash, p;
892 uint64_t osize;
893 Object *o;
894 int r;
895
896 assert(f);
897 assert(field && size > 0);
898
899 hash = hash64(field, size);
900
901 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
902 if (r < 0)
903 return r;
904 else if (r > 0) {
905
906 if (ret)
907 *ret = o;
908
909 if (offset)
910 *offset = p;
911
912 return 0;
913 }
914
915 osize = offsetof(Object, field.payload) + size;
916 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
917 if (r < 0)
918 return r;
3c1668da
LP
919
920 o->field.hash = htole64(hash);
921 memcpy(o->field.payload, field, size);
922
923 r = journal_file_link_field(f, o, p, hash);
924 if (r < 0)
925 return r;
926
927 /* The linking might have altered the window, so let's
928 * refresh our pointer */
929 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
930 if (r < 0)
931 return r;
932
933#ifdef HAVE_GCRYPT
934 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
935 if (r < 0)
936 return r;
937#endif
938
939 if (ret)
940 *ret = o;
941
942 if (offset)
943 *offset = p;
944
945 return 0;
946}
947
48496df6
LP
948static int journal_file_append_data(
949 JournalFile *f,
950 const void *data, uint64_t size,
951 Object **ret, uint64_t *offset) {
952
de190aef
LP
953 uint64_t hash, p;
954 uint64_t osize;
955 Object *o;
d89c8fdf 956 int r, compression = 0;
3c1668da 957 const void *eq;
de190aef
LP
958
959 assert(f);
960 assert(data || size == 0);
961
962 hash = hash64(data, size);
963
964 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
965 if (r < 0)
966 return r;
967 else if (r > 0) {
968
969 if (ret)
970 *ret = o;
971
972 if (offset)
973 *offset = p;
974
975 return 0;
976 }
977
978 osize = offsetof(Object, data.payload) + size;
979 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
980 if (r < 0)
981 return r;
982
cec736d2 983 o->data.hash = htole64(hash);
807e17f0 984
d89c8fdf
ZJS
985#if defined(HAVE_XZ) || defined(HAVE_LZ4)
986 if (f->compress_xz &&
807e17f0 987 size >= COMPRESSION_SIZE_THRESHOLD) {
fa1c4b51 988 size_t rsize;
807e17f0 989
d89c8fdf 990 compression = compress_blob(data, size, o->data.payload, &rsize);
807e17f0 991
d89c8fdf 992 if (compression) {
807e17f0 993 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 994 o->object.flags |= compression;
807e17f0 995
fa1c4b51 996 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 997 size, rsize, object_compressed_to_string(compression));
807e17f0
LP
998 }
999 }
1000#endif
1001
d89c8fdf 1002 if (!compression && size > 0)
807e17f0 1003 memcpy(o->data.payload, data, size);
cec736d2 1004
de190aef 1005 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1006 if (r < 0)
1007 return r;
1008
48496df6
LP
1009 /* The linking might have altered the window, so let's
1010 * refresh our pointer */
1011 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1012 if (r < 0)
1013 return r;
1014
08c6f819
SL
1015 if (!data)
1016 eq = NULL;
1017 else
1018 eq = memchr(data, '=', size);
3c1668da 1019 if (eq && eq > data) {
748db592 1020 Object *fo = NULL;
3c1668da 1021 uint64_t fp;
3c1668da
LP
1022
1023 /* Create field object ... */
1024 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1025 if (r < 0)
1026 return r;
1027
1028 /* ... and link it in. */
1029 o->data.next_field_offset = fo->field.head_data_offset;
1030 fo->field.head_data_offset = le64toh(p);
1031 }
1032
5996c7c2
LP
1033#ifdef HAVE_GCRYPT
1034 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1035 if (r < 0)
1036 return r;
1037#endif
1038
cec736d2
LP
1039 if (ret)
1040 *ret = o;
1041
1042 if (offset)
de190aef 1043 *offset = p;
cec736d2
LP
1044
1045 return 0;
1046}
1047
1048uint64_t journal_file_entry_n_items(Object *o) {
1049 assert(o);
b588975f
LP
1050
1051 if (o->object.type != OBJECT_ENTRY)
1052 return 0;
cec736d2
LP
1053
1054 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1055}
1056
0284adc6 1057uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1058 assert(o);
b588975f
LP
1059
1060 if (o->object.type != OBJECT_ENTRY_ARRAY)
1061 return 0;
de190aef
LP
1062
1063 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1064}
1065
fb9a24b6
LP
1066uint64_t journal_file_hash_table_n_items(Object *o) {
1067 assert(o);
b588975f
LP
1068
1069 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1070 o->object.type != OBJECT_FIELD_HASH_TABLE)
1071 return 0;
fb9a24b6
LP
1072
1073 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1074}
1075
de190aef 1076static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1077 le64_t *first,
1078 le64_t *idx,
de190aef 1079 uint64_t p) {
cec736d2 1080 int r;
de190aef
LP
1081 uint64_t n = 0, ap = 0, q, i, a, hidx;
1082 Object *o;
1083
cec736d2 1084 assert(f);
de190aef
LP
1085 assert(first);
1086 assert(idx);
1087 assert(p > 0);
cec736d2 1088
de190aef
LP
1089 a = le64toh(*first);
1090 i = hidx = le64toh(*idx);
1091 while (a > 0) {
1092
1093 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1094 if (r < 0)
1095 return r;
cec736d2 1096
de190aef
LP
1097 n = journal_file_entry_array_n_items(o);
1098 if (i < n) {
1099 o->entry_array.items[i] = htole64(p);
1100 *idx = htole64(hidx + 1);
1101 return 0;
1102 }
cec736d2 1103
de190aef
LP
1104 i -= n;
1105 ap = a;
1106 a = le64toh(o->entry_array.next_entry_array_offset);
1107 }
1108
1109 if (hidx > n)
1110 n = (hidx+1) * 2;
1111 else
1112 n = n * 2;
1113
1114 if (n < 4)
1115 n = 4;
1116
1117 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1118 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1119 &o, &q);
cec736d2
LP
1120 if (r < 0)
1121 return r;
1122
feb12d3e 1123#ifdef HAVE_GCRYPT
5996c7c2 1124 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1125 if (r < 0)
1126 return r;
feb12d3e 1127#endif
b0af6f41 1128
de190aef 1129 o->entry_array.items[i] = htole64(p);
cec736d2 1130
de190aef 1131 if (ap == 0)
7be3aa17 1132 *first = htole64(q);
cec736d2 1133 else {
de190aef 1134 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1135 if (r < 0)
1136 return r;
1137
de190aef
LP
1138 o->entry_array.next_entry_array_offset = htole64(q);
1139 }
cec736d2 1140
2dee23eb
LP
1141 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1142 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1143
de190aef
LP
1144 *idx = htole64(hidx + 1);
1145
1146 return 0;
1147}
cec736d2 1148
de190aef 1149static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1150 le64_t *extra,
1151 le64_t *first,
1152 le64_t *idx,
de190aef
LP
1153 uint64_t p) {
1154
1155 int r;
1156
1157 assert(f);
1158 assert(extra);
1159 assert(first);
1160 assert(idx);
1161 assert(p > 0);
1162
1163 if (*idx == 0)
1164 *extra = htole64(p);
1165 else {
4fd052ae 1166 le64_t i;
de190aef 1167
7be3aa17 1168 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1169 r = link_entry_into_array(f, first, &i, p);
1170 if (r < 0)
1171 return r;
cec736d2
LP
1172 }
1173
de190aef
LP
1174 *idx = htole64(le64toh(*idx) + 1);
1175 return 0;
1176}
1177
1178static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1179 uint64_t p;
1180 int r;
1181 assert(f);
1182 assert(o);
1183 assert(offset > 0);
1184
1185 p = le64toh(o->entry.items[i].object_offset);
1186 if (p == 0)
1187 return -EINVAL;
1188
1189 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1190 if (r < 0)
1191 return r;
1192
de190aef
LP
1193 return link_entry_into_array_plus_one(f,
1194 &o->data.entry_offset,
1195 &o->data.entry_array_offset,
1196 &o->data.n_entries,
1197 offset);
cec736d2
LP
1198}
1199
1200static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1201 uint64_t n, i;
cec736d2
LP
1202 int r;
1203
1204 assert(f);
1205 assert(o);
1206 assert(offset > 0);
b588975f
LP
1207
1208 if (o->object.type != OBJECT_ENTRY)
1209 return -EINVAL;
cec736d2 1210
b788cc23
LP
1211 __sync_synchronize();
1212
cec736d2 1213 /* Link up the entry itself */
de190aef
LP
1214 r = link_entry_into_array(f,
1215 &f->header->entry_array_offset,
1216 &f->header->n_entries,
1217 offset);
1218 if (r < 0)
1219 return r;
cec736d2 1220
507f22bd 1221 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1222
de190aef 1223 if (f->header->head_entry_realtime == 0)
0ac38b70 1224 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1225
0ac38b70 1226 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1227 f->header->tail_entry_monotonic = o->entry.monotonic;
1228
1229 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1230
1231 /* Link up the items */
1232 n = journal_file_entry_n_items(o);
1233 for (i = 0; i < n; i++) {
1234 r = journal_file_link_entry_item(f, o, offset, i);
1235 if (r < 0)
1236 return r;
1237 }
1238
cec736d2
LP
1239 return 0;
1240}
1241
1242static int journal_file_append_entry_internal(
1243 JournalFile *f,
1244 const dual_timestamp *ts,
1245 uint64_t xor_hash,
1246 const EntryItem items[], unsigned n_items,
de190aef 1247 uint64_t *seqnum,
cec736d2
LP
1248 Object **ret, uint64_t *offset) {
1249 uint64_t np;
1250 uint64_t osize;
1251 Object *o;
1252 int r;
1253
1254 assert(f);
1255 assert(items || n_items == 0);
de190aef 1256 assert(ts);
cec736d2
LP
1257
1258 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1259
de190aef 1260 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1261 if (r < 0)
1262 return r;
1263
d98cc1f2 1264 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1265 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1266 o->entry.realtime = htole64(ts->realtime);
1267 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1268 o->entry.xor_hash = htole64(xor_hash);
1269 o->entry.boot_id = f->header->boot_id;
1270
feb12d3e 1271#ifdef HAVE_GCRYPT
5996c7c2 1272 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1273 if (r < 0)
1274 return r;
feb12d3e 1275#endif
b0af6f41 1276
cec736d2
LP
1277 r = journal_file_link_entry(f, o, np);
1278 if (r < 0)
1279 return r;
1280
1281 if (ret)
1282 *ret = o;
1283
1284 if (offset)
1285 *offset = np;
1286
1287 return 0;
1288}
1289
cf244689 1290void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1291 assert(f);
1292
1293 /* inotify() does not receive IN_MODIFY events from file
1294 * accesses done via mmap(). After each access we hence
1295 * trigger IN_MODIFY by truncating the journal file to its
1296 * current size which triggers IN_MODIFY. */
1297
bc85bfee
LP
1298 __sync_synchronize();
1299
50f20cfd 1300 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
56f64d95 1301 log_error_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1302}
1303
1f2da9ec
LP
1304static int entry_item_cmp(const void *_a, const void *_b) {
1305 const EntryItem *a = _a, *b = _b;
1306
1307 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1308 return -1;
1309 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1310 return 1;
1311 return 0;
1312}
1313
de190aef 1314int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1315 unsigned i;
1316 EntryItem *items;
1317 int r;
1318 uint64_t xor_hash = 0;
de190aef 1319 struct dual_timestamp _ts;
cec736d2
LP
1320
1321 assert(f);
1322 assert(iovec || n_iovec == 0);
1323
de190aef
LP
1324 if (!ts) {
1325 dual_timestamp_get(&_ts);
1326 ts = &_ts;
1327 }
1328
1329 if (f->tail_entry_monotonic_valid &&
1330 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1331 return -EINVAL;
1332
feb12d3e 1333#ifdef HAVE_GCRYPT
7560fffc
LP
1334 r = journal_file_maybe_append_tag(f, ts->realtime);
1335 if (r < 0)
1336 return r;
feb12d3e 1337#endif
7560fffc 1338
64825d3c 1339 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1340 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1341
1342 for (i = 0; i < n_iovec; i++) {
1343 uint64_t p;
1344 Object *o;
1345
1346 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1347 if (r < 0)
cf244689 1348 return r;
cec736d2
LP
1349
1350 xor_hash ^= le64toh(o->data.hash);
1351 items[i].object_offset = htole64(p);
de7b95cd 1352 items[i].hash = o->data.hash;
cec736d2
LP
1353 }
1354
1f2da9ec
LP
1355 /* Order by the position on disk, in order to improve seek
1356 * times for rotating media. */
7ff7394d 1357 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1358
de190aef 1359 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1360
50f20cfd
LP
1361 journal_file_post_change(f);
1362
cec736d2
LP
1363 return r;
1364}
1365
/* One cached position within a chain of entry arrays, keyed by the
 * offset of the first array of the chain. */
typedef struct ChainCacheItem {
        /* Offset of the array at the beginning of the chain */
        uint64_t first;

        /* Offset of the cached array */
        uint64_t array;

        /* The first item stored in the cached array */
        uint64_t begin;

        /* Total number of items in all arrays of the chain that
         * precede the cached one */
        uint64_t total;

        /* The last index we looked at in the cached array, used to
         * optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1373
1374static void chain_cache_put(
4743015d 1375 OrderedHashmap *h,
a4bcff5b
LP
1376 ChainCacheItem *ci,
1377 uint64_t first,
1378 uint64_t array,
1379 uint64_t begin,
f268980d
LP
1380 uint64_t total,
1381 uint64_t last_index) {
a4bcff5b
LP
1382
1383 if (!ci) {
34741aa3
LP
1384 /* If the chain item to cache for this chain is the
1385 * first one it's not worth caching anything */
1386 if (array == first)
1387 return;
1388
29433089 1389 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 1390 ci = ordered_hashmap_steal_first(h);
29433089
LP
1391 assert(ci);
1392 } else {
a4bcff5b
LP
1393 ci = new(ChainCacheItem, 1);
1394 if (!ci)
1395 return;
1396 }
1397
1398 ci->first = first;
1399
4743015d 1400 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
1401 free(ci);
1402 return;
1403 }
1404 } else
1405 assert(ci->first == first);
1406
1407 ci->array = array;
1408 ci->begin = begin;
1409 ci->total = total;
f268980d 1410 ci->last_index = last_index;
a4bcff5b
LP
1411}
1412
f268980d
LP
1413static int generic_array_get(
1414 JournalFile *f,
1415 uint64_t first,
1416 uint64_t i,
1417 Object **ret, uint64_t *offset) {
de190aef 1418
cec736d2 1419 Object *o;
a4bcff5b 1420 uint64_t p = 0, a, t = 0;
cec736d2 1421 int r;
a4bcff5b 1422 ChainCacheItem *ci;
cec736d2
LP
1423
1424 assert(f);
1425
de190aef 1426 a = first;
a4bcff5b
LP
1427
1428 /* Try the chain cache first */
4743015d 1429 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1430 if (ci && i > ci->total) {
1431 a = ci->array;
1432 i -= ci->total;
1433 t = ci->total;
1434 }
1435
de190aef 1436 while (a > 0) {
a4bcff5b 1437 uint64_t k;
cec736d2 1438
de190aef
LP
1439 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1440 if (r < 0)
1441 return r;
cec736d2 1442
a4bcff5b
LP
1443 k = journal_file_entry_array_n_items(o);
1444 if (i < k) {
de190aef 1445 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1446 goto found;
cec736d2
LP
1447 }
1448
a4bcff5b
LP
1449 i -= k;
1450 t += k;
de190aef
LP
1451 a = le64toh(o->entry_array.next_entry_array_offset);
1452 }
1453
a4bcff5b
LP
1454 return 0;
1455
1456found:
1457 /* Let's cache this item for the next invocation */
af13a6b0 1458 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1459
1460 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1461 if (r < 0)
1462 return r;
1463
1464 if (ret)
1465 *ret = o;
1466
1467 if (offset)
1468 *offset = p;
1469
1470 return 1;
1471}
1472
f268980d
LP
1473static int generic_array_get_plus_one(
1474 JournalFile *f,
1475 uint64_t extra,
1476 uint64_t first,
1477 uint64_t i,
1478 Object **ret, uint64_t *offset) {
de190aef
LP
1479
1480 Object *o;
1481
1482 assert(f);
1483
1484 if (i == 0) {
1485 int r;
1486
1487 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1488 if (r < 0)
1489 return r;
1490
de190aef
LP
1491 if (ret)
1492 *ret = o;
cec736d2 1493
de190aef
LP
1494 if (offset)
1495 *offset = extra;
cec736d2 1496
de190aef 1497 return 1;
cec736d2
LP
1498 }
1499
de190aef
LP
1500 return generic_array_get(f, first, i-1, ret, offset);
1501}
cec736d2 1502
de190aef
LP
/* Verdicts of the test_object callbacks used when bisecting */
enum {
        TEST_FOUND = 0,  /* the tested item matches the needle */
        TEST_LEFT  = 1,  /* the tested item is ordered before the needle */
        TEST_RIGHT = 2   /* the tested item is ordered after the needle */
};
cec736d2 1508
f268980d
LP
1509static int generic_array_bisect(
1510 JournalFile *f,
1511 uint64_t first,
1512 uint64_t n,
1513 uint64_t needle,
1514 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1515 direction_t direction,
1516 Object **ret,
1517 uint64_t *offset,
1518 uint64_t *idx) {
1519
1520 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1521 bool subtract_one = false;
1522 Object *o, *array = NULL;
1523 int r;
a4bcff5b 1524 ChainCacheItem *ci;
cec736d2 1525
de190aef
LP
1526 assert(f);
1527 assert(test_object);
cec736d2 1528
a4bcff5b 1529 /* Start with the first array in the chain */
de190aef 1530 a = first;
a4bcff5b 1531
4743015d 1532 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1533 if (ci && n > ci->total) {
1534 /* Ah, we have iterated this bisection array chain
1535 * previously! Let's see if we can skip ahead in the
1536 * chain, as far as the last time. But we can't jump
1537 * backwards in the chain, so let's check that
1538 * first. */
1539
1540 r = test_object(f, ci->begin, needle);
1541 if (r < 0)
1542 return r;
1543
1544 if (r == TEST_LEFT) {
f268980d 1545 /* OK, what we are looking for is right of the
a4bcff5b
LP
1546 * begin of this EntryArray, so let's jump
1547 * straight to previously cached array in the
1548 * chain */
1549
1550 a = ci->array;
1551 n -= ci->total;
1552 t = ci->total;
f268980d 1553 last_index = ci->last_index;
a4bcff5b
LP
1554 }
1555 }
1556
de190aef
LP
1557 while (a > 0) {
1558 uint64_t left, right, k, lp;
1559
1560 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1561 if (r < 0)
1562 return r;
1563
de190aef
LP
1564 k = journal_file_entry_array_n_items(array);
1565 right = MIN(k, n);
1566 if (right <= 0)
1567 return 0;
cec736d2 1568
de190aef
LP
1569 i = right - 1;
1570 lp = p = le64toh(array->entry_array.items[i]);
1571 if (p <= 0)
1572 return -EBADMSG;
cec736d2 1573
de190aef
LP
1574 r = test_object(f, p, needle);
1575 if (r < 0)
1576 return r;
cec736d2 1577
de190aef
LP
1578 if (r == TEST_FOUND)
1579 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1580
1581 if (r == TEST_RIGHT) {
1582 left = 0;
1583 right -= 1;
f268980d
LP
1584
1585 if (last_index != (uint64_t) -1) {
1586 assert(last_index <= right);
1587
1588 /* If we cached the last index we
1589 * looked at, let's try to not to jump
1590 * too wildly around and see if we can
1591 * limit the range to look at early to
1592 * the immediate neighbors of the last
1593 * index we looked at. */
1594
1595 if (last_index > 0) {
1596 uint64_t x = last_index - 1;
1597
1598 p = le64toh(array->entry_array.items[x]);
1599 if (p <= 0)
1600 return -EBADMSG;
1601
1602 r = test_object(f, p, needle);
1603 if (r < 0)
1604 return r;
1605
1606 if (r == TEST_FOUND)
1607 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1608
1609 if (r == TEST_RIGHT)
1610 right = x;
1611 else
1612 left = x + 1;
1613 }
1614
1615 if (last_index < right) {
1616 uint64_t y = last_index + 1;
1617
1618 p = le64toh(array->entry_array.items[y]);
1619 if (p <= 0)
1620 return -EBADMSG;
1621
1622 r = test_object(f, p, needle);
1623 if (r < 0)
1624 return r;
1625
1626 if (r == TEST_FOUND)
1627 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1628
1629 if (r == TEST_RIGHT)
1630 right = y;
1631 else
1632 left = y + 1;
1633 }
f268980d
LP
1634 }
1635
de190aef
LP
1636 for (;;) {
1637 if (left == right) {
1638 if (direction == DIRECTION_UP)
1639 subtract_one = true;
1640
1641 i = left;
1642 goto found;
1643 }
1644
1645 assert(left < right);
de190aef 1646 i = (left + right) / 2;
f268980d 1647
de190aef
LP
1648 p = le64toh(array->entry_array.items[i]);
1649 if (p <= 0)
1650 return -EBADMSG;
1651
1652 r = test_object(f, p, needle);
1653 if (r < 0)
1654 return r;
cec736d2 1655
de190aef
LP
1656 if (r == TEST_FOUND)
1657 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1658
1659 if (r == TEST_RIGHT)
1660 right = i;
1661 else
1662 left = i + 1;
1663 }
1664 }
1665
2173cbf8 1666 if (k >= n) {
cbdca852
LP
1667 if (direction == DIRECTION_UP) {
1668 i = n;
1669 subtract_one = true;
1670 goto found;
1671 }
1672
cec736d2 1673 return 0;
cbdca852 1674 }
cec736d2 1675
de190aef
LP
1676 last_p = lp;
1677
1678 n -= k;
1679 t += k;
f268980d 1680 last_index = (uint64_t) -1;
de190aef 1681 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1682 }
1683
1684 return 0;
de190aef
LP
1685
1686found:
1687 if (subtract_one && t == 0 && i == 0)
1688 return 0;
1689
a4bcff5b 1690 /* Let's cache this item for the next invocation */
af13a6b0 1691 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 1692
de190aef
LP
1693 if (subtract_one && i == 0)
1694 p = last_p;
1695 else if (subtract_one)
1696 p = le64toh(array->entry_array.items[i-1]);
1697 else
1698 p = le64toh(array->entry_array.items[i]);
1699
1700 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1701 if (r < 0)
1702 return r;
1703
1704 if (ret)
1705 *ret = o;
1706
1707 if (offset)
1708 *offset = p;
1709
1710 if (idx)
cbdca852 1711 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1712
1713 return 1;
cec736d2
LP
1714}
1715
f268980d
LP
1716
1717static int generic_array_bisect_plus_one(
1718 JournalFile *f,
1719 uint64_t extra,
1720 uint64_t first,
1721 uint64_t n,
1722 uint64_t needle,
1723 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1724 direction_t direction,
1725 Object **ret,
1726 uint64_t *offset,
1727 uint64_t *idx) {
de190aef 1728
cec736d2 1729 int r;
cbdca852
LP
1730 bool step_back = false;
1731 Object *o;
cec736d2
LP
1732
1733 assert(f);
de190aef 1734 assert(test_object);
cec736d2 1735
de190aef
LP
1736 if (n <= 0)
1737 return 0;
cec736d2 1738
de190aef
LP
1739 /* This bisects the array in object 'first', but first checks
1740 * an extra */
de190aef
LP
1741 r = test_object(f, extra, needle);
1742 if (r < 0)
1743 return r;
a536e261
LP
1744
1745 if (r == TEST_FOUND)
1746 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1747
cbdca852
LP
1748 /* if we are looking with DIRECTION_UP then we need to first
1749 see if in the actual array there is a matching entry, and
1750 return the last one of that. But if there isn't any we need
1751 to return this one. Hence remember this, and return it
1752 below. */
1753 if (r == TEST_LEFT)
1754 step_back = direction == DIRECTION_UP;
de190aef 1755
cbdca852
LP
1756 if (r == TEST_RIGHT) {
1757 if (direction == DIRECTION_DOWN)
1758 goto found;
1759 else
1760 return 0;
a536e261 1761 }
cec736d2 1762
de190aef
LP
1763 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1764
cbdca852
LP
1765 if (r == 0 && step_back)
1766 goto found;
1767
ecf68b1d 1768 if (r > 0 && idx)
de190aef
LP
1769 (*idx) ++;
1770
1771 return r;
cbdca852
LP
1772
1773found:
1774 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1775 if (r < 0)
1776 return r;
1777
1778 if (ret)
1779 *ret = o;
1780
1781 if (offset)
1782 *offset = extra;
1783
1784 if (idx)
1785 *idx = 0;
1786
1787 return 1;
1788}
1789
44a6b1b6 1790_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1791 assert(f);
1792 assert(p > 0);
1793
1794 if (p == needle)
1795 return TEST_FOUND;
1796 else if (p < needle)
1797 return TEST_LEFT;
1798 else
1799 return TEST_RIGHT;
1800}
1801
de190aef
LP
1802static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1803 Object *o;
1804 int r;
1805
1806 assert(f);
1807 assert(p > 0);
1808
1809 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1810 if (r < 0)
1811 return r;
1812
de190aef
LP
1813 if (le64toh(o->entry.seqnum) == needle)
1814 return TEST_FOUND;
1815 else if (le64toh(o->entry.seqnum) < needle)
1816 return TEST_LEFT;
1817 else
1818 return TEST_RIGHT;
1819}
cec736d2 1820
de190aef
LP
1821int journal_file_move_to_entry_by_seqnum(
1822 JournalFile *f,
1823 uint64_t seqnum,
1824 direction_t direction,
1825 Object **ret,
1826 uint64_t *offset) {
1827
1828 return generic_array_bisect(f,
1829 le64toh(f->header->entry_array_offset),
1830 le64toh(f->header->n_entries),
1831 seqnum,
1832 test_object_seqnum,
1833 direction,
1834 ret, offset, NULL);
1835}
cec736d2 1836
de190aef
LP
1837static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1838 Object *o;
1839 int r;
1840
1841 assert(f);
1842 assert(p > 0);
1843
1844 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1845 if (r < 0)
1846 return r;
1847
1848 if (le64toh(o->entry.realtime) == needle)
1849 return TEST_FOUND;
1850 else if (le64toh(o->entry.realtime) < needle)
1851 return TEST_LEFT;
1852 else
1853 return TEST_RIGHT;
cec736d2
LP
1854}
1855
de190aef
LP
1856int journal_file_move_to_entry_by_realtime(
1857 JournalFile *f,
1858 uint64_t realtime,
1859 direction_t direction,
1860 Object **ret,
1861 uint64_t *offset) {
1862
1863 return generic_array_bisect(f,
1864 le64toh(f->header->entry_array_offset),
1865 le64toh(f->header->n_entries),
1866 realtime,
1867 test_object_realtime,
1868 direction,
1869 ret, offset, NULL);
1870}
1871
1872static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1873 Object *o;
1874 int r;
1875
1876 assert(f);
1877 assert(p > 0);
1878
1879 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1880 if (r < 0)
1881 return r;
1882
1883 if (le64toh(o->entry.monotonic) == needle)
1884 return TEST_FOUND;
1885 else if (le64toh(o->entry.monotonic) < needle)
1886 return TEST_LEFT;
1887 else
1888 return TEST_RIGHT;
1889}
1890
47838ab3
ZJS
1891static inline int find_data_object_by_boot_id(
1892 JournalFile *f,
1893 sd_id128_t boot_id,
1894 Object **o,
1895 uint64_t *b) {
1896 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1897
1898 sd_id128_to_string(boot_id, t + 9);
1899 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1900}
1901
de190aef
LP
1902int journal_file_move_to_entry_by_monotonic(
1903 JournalFile *f,
1904 sd_id128_t boot_id,
1905 uint64_t monotonic,
1906 direction_t direction,
1907 Object **ret,
1908 uint64_t *offset) {
1909
de190aef
LP
1910 Object *o;
1911 int r;
1912
cbdca852 1913 assert(f);
de190aef 1914
47838ab3 1915 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
1916 if (r < 0)
1917 return r;
cbdca852 1918 if (r == 0)
de190aef
LP
1919 return -ENOENT;
1920
1921 return generic_array_bisect_plus_one(f,
1922 le64toh(o->data.entry_offset),
1923 le64toh(o->data.entry_array_offset),
1924 le64toh(o->data.n_entries),
1925 monotonic,
1926 test_object_monotonic,
1927 direction,
1928 ret, offset, NULL);
1929}
1930
de190aef
LP
1931int journal_file_next_entry(
1932 JournalFile *f,
1933 Object *o, uint64_t p,
1934 direction_t direction,
1935 Object **ret, uint64_t *offset) {
1936
fb099c8d 1937 uint64_t i, n, ofs;
cec736d2
LP
1938 int r;
1939
1940 assert(f);
de190aef
LP
1941 assert(p > 0 || !o);
1942
1943 n = le64toh(f->header->n_entries);
1944 if (n <= 0)
1945 return 0;
cec736d2
LP
1946
1947 if (!o)
de190aef 1948 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 1949 else {
de190aef 1950 if (o->object.type != OBJECT_ENTRY)
cec736d2
LP
1951 return -EINVAL;
1952
de190aef
LP
1953 r = generic_array_bisect(f,
1954 le64toh(f->header->entry_array_offset),
1955 le64toh(f->header->n_entries),
1956 p,
1957 test_object_offset,
1958 DIRECTION_DOWN,
1959 NULL, NULL,
1960 &i);
1961 if (r <= 0)
1962 return r;
1963
1964 if (direction == DIRECTION_DOWN) {
1965 if (i >= n - 1)
1966 return 0;
1967
1968 i++;
1969 } else {
1970 if (i <= 0)
1971 return 0;
1972
1973 i--;
1974 }
cec736d2
LP
1975 }
1976
de190aef 1977 /* And jump to it */
fb099c8d
ZJS
1978 r = generic_array_get(f,
1979 le64toh(f->header->entry_array_offset),
1980 i,
1981 ret, &ofs);
1982 if (r <= 0)
1983 return r;
1984
1985 if (p > 0 &&
1986 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
1987 log_debug("%s: entry array corrupted at entry %"PRIu64,
1988 f->path, i);
1989 return -EBADMSG;
1990 }
1991
1992 if (offset)
1993 *offset = ofs;
1994
1995 return 1;
de190aef 1996}
cec736d2 1997
de190aef
LP
1998int journal_file_next_entry_for_data(
1999 JournalFile *f,
2000 Object *o, uint64_t p,
2001 uint64_t data_offset,
2002 direction_t direction,
2003 Object **ret, uint64_t *offset) {
2004
2005 uint64_t n, i;
cec736d2 2006 int r;
de190aef 2007 Object *d;
cec736d2
LP
2008
2009 assert(f);
de190aef 2010 assert(p > 0 || !o);
cec736d2 2011
de190aef 2012 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2013 if (r < 0)
de190aef 2014 return r;
cec736d2 2015
de190aef
LP
2016 n = le64toh(d->data.n_entries);
2017 if (n <= 0)
2018 return n;
cec736d2 2019
de190aef
LP
2020 if (!o)
2021 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2022 else {
2023 if (o->object.type != OBJECT_ENTRY)
2024 return -EINVAL;
cec736d2 2025
de190aef
LP
2026 r = generic_array_bisect_plus_one(f,
2027 le64toh(d->data.entry_offset),
2028 le64toh(d->data.entry_array_offset),
2029 le64toh(d->data.n_entries),
2030 p,
2031 test_object_offset,
2032 DIRECTION_DOWN,
2033 NULL, NULL,
2034 &i);
2035
2036 if (r <= 0)
cec736d2
LP
2037 return r;
2038
de190aef
LP
2039 if (direction == DIRECTION_DOWN) {
2040 if (i >= n - 1)
2041 return 0;
cec736d2 2042
de190aef
LP
2043 i++;
2044 } else {
2045 if (i <= 0)
2046 return 0;
cec736d2 2047
de190aef
LP
2048 i--;
2049 }
cec736d2 2050
de190aef 2051 }
cec736d2 2052
de190aef
LP
2053 return generic_array_get_plus_one(f,
2054 le64toh(d->data.entry_offset),
2055 le64toh(d->data.entry_array_offset),
2056 i,
2057 ret, offset);
2058}
cec736d2 2059
cbdca852
LP
2060int journal_file_move_to_entry_by_offset_for_data(
2061 JournalFile *f,
2062 uint64_t data_offset,
2063 uint64_t p,
2064 direction_t direction,
2065 Object **ret, uint64_t *offset) {
2066
2067 int r;
2068 Object *d;
2069
2070 assert(f);
2071
2072 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2073 if (r < 0)
2074 return r;
2075
2076 return generic_array_bisect_plus_one(f,
2077 le64toh(d->data.entry_offset),
2078 le64toh(d->data.entry_array_offset),
2079 le64toh(d->data.n_entries),
2080 p,
2081 test_object_offset,
2082 direction,
2083 ret, offset, NULL);
2084}
2085
2086int journal_file_move_to_entry_by_monotonic_for_data(
2087 JournalFile *f,
2088 uint64_t data_offset,
2089 sd_id128_t boot_id,
2090 uint64_t monotonic,
2091 direction_t direction,
2092 Object **ret, uint64_t *offset) {
2093
cbdca852
LP
2094 Object *o, *d;
2095 int r;
2096 uint64_t b, z;
2097
2098 assert(f);
2099
2100 /* First, seek by time */
47838ab3 2101 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2102 if (r < 0)
2103 return r;
2104 if (r == 0)
2105 return -ENOENT;
2106
2107 r = generic_array_bisect_plus_one(f,
2108 le64toh(o->data.entry_offset),
2109 le64toh(o->data.entry_array_offset),
2110 le64toh(o->data.n_entries),
2111 monotonic,
2112 test_object_monotonic,
2113 direction,
2114 NULL, &z, NULL);
2115 if (r <= 0)
2116 return r;
2117
2118 /* And now, continue seeking until we find an entry that
2119 * exists in both bisection arrays */
2120
2121 for (;;) {
2122 Object *qo;
2123 uint64_t p, q;
2124
2125 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2126 if (r < 0)
2127 return r;
2128
2129 r = generic_array_bisect_plus_one(f,
2130 le64toh(d->data.entry_offset),
2131 le64toh(d->data.entry_array_offset),
2132 le64toh(d->data.n_entries),
2133 z,
2134 test_object_offset,
2135 direction,
2136 NULL, &p, NULL);
2137 if (r <= 0)
2138 return r;
2139
2140 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2141 if (r < 0)
2142 return r;
2143
2144 r = generic_array_bisect_plus_one(f,
2145 le64toh(o->data.entry_offset),
2146 le64toh(o->data.entry_array_offset),
2147 le64toh(o->data.n_entries),
2148 p,
2149 test_object_offset,
2150 direction,
2151 &qo, &q, NULL);
2152
2153 if (r <= 0)
2154 return r;
2155
2156 if (p == q) {
2157 if (ret)
2158 *ret = qo;
2159 if (offset)
2160 *offset = q;
2161
2162 return 1;
2163 }
2164
2165 z = q;
2166 }
cbdca852
LP
2167}
2168
de190aef
LP
2169int journal_file_move_to_entry_by_seqnum_for_data(
2170 JournalFile *f,
2171 uint64_t data_offset,
2172 uint64_t seqnum,
2173 direction_t direction,
2174 Object **ret, uint64_t *offset) {
cec736d2 2175
de190aef
LP
2176 Object *d;
2177 int r;
cec736d2 2178
91a31dde
LP
2179 assert(f);
2180
de190aef 2181 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2182 if (r < 0)
de190aef 2183 return r;
cec736d2 2184
de190aef
LP
2185 return generic_array_bisect_plus_one(f,
2186 le64toh(d->data.entry_offset),
2187 le64toh(d->data.entry_array_offset),
2188 le64toh(d->data.n_entries),
2189 seqnum,
2190 test_object_seqnum,
2191 direction,
2192 ret, offset, NULL);
2193}
cec736d2 2194
de190aef
LP
2195int journal_file_move_to_entry_by_realtime_for_data(
2196 JournalFile *f,
2197 uint64_t data_offset,
2198 uint64_t realtime,
2199 direction_t direction,
2200 Object **ret, uint64_t *offset) {
2201
2202 Object *d;
2203 int r;
2204
91a31dde
LP
2205 assert(f);
2206
de190aef 2207 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2208 if (r < 0)
de190aef
LP
2209 return r;
2210
2211 return generic_array_bisect_plus_one(f,
2212 le64toh(d->data.entry_offset),
2213 le64toh(d->data.entry_array_offset),
2214 le64toh(d->data.n_entries),
2215 realtime,
2216 test_object_realtime,
2217 direction,
2218 ret, offset, NULL);
cec736d2
LP
2219}
2220
0284adc6 2221void journal_file_dump(JournalFile *f) {
7560fffc 2222 Object *o;
7560fffc 2223 int r;
0284adc6 2224 uint64_t p;
7560fffc
LP
2225
2226 assert(f);
2227
0284adc6 2228 journal_file_print_header(f);
7560fffc 2229
0284adc6
LP
2230 p = le64toh(f->header->header_size);
2231 while (p != 0) {
d05089d8 2232 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
2233 if (r < 0)
2234 goto fail;
7560fffc 2235
0284adc6 2236 switch (o->object.type) {
d98cc1f2 2237
0284adc6
LP
2238 case OBJECT_UNUSED:
2239 printf("Type: OBJECT_UNUSED\n");
2240 break;
d98cc1f2 2241
0284adc6
LP
2242 case OBJECT_DATA:
2243 printf("Type: OBJECT_DATA\n");
2244 break;
7560fffc 2245
3c1668da
LP
2246 case OBJECT_FIELD:
2247 printf("Type: OBJECT_FIELD\n");
2248 break;
2249
0284adc6 2250 case OBJECT_ENTRY:
507f22bd
ZJS
2251 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2252 le64toh(o->entry.seqnum),
2253 le64toh(o->entry.monotonic),
2254 le64toh(o->entry.realtime));
0284adc6 2255 break;
7560fffc 2256
0284adc6
LP
2257 case OBJECT_FIELD_HASH_TABLE:
2258 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2259 break;
7560fffc 2260
0284adc6
LP
2261 case OBJECT_DATA_HASH_TABLE:
2262 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2263 break;
7560fffc 2264
0284adc6
LP
2265 case OBJECT_ENTRY_ARRAY:
2266 printf("Type: OBJECT_ENTRY_ARRAY\n");
2267 break;
7560fffc 2268
0284adc6 2269 case OBJECT_TAG:
507f22bd
ZJS
2270 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2271 le64toh(o->tag.seqnum),
2272 le64toh(o->tag.epoch));
0284adc6 2273 break;
3c1668da
LP
2274
2275 default:
2276 printf("Type: unknown (%u)\n", o->object.type);
2277 break;
0284adc6 2278 }
7560fffc 2279
d89c8fdf
ZJS
2280 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2281 printf("Flags: %s\n",
2282 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 2283
0284adc6
LP
2284 if (p == le64toh(f->header->tail_object_offset))
2285 p = 0;
2286 else
2287 p = p + ALIGN64(le64toh(o->object.size));
2288 }
7560fffc 2289
0284adc6
LP
2290 return;
2291fail:
2292 log_error("File corrupt");
7560fffc
LP
2293}
2294
718fe4b1
ZJS
2295static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2296 const char *x;
2297
2298 x = format_timestamp(buf, l, t);
2299 if (x)
2300 return x;
2301 return " --- ";
2302}
2303
0284adc6 2304void journal_file_print_header(JournalFile *f) {
2765b7bb 2305 char a[33], b[33], c[33], d[33];
ed375beb 2306 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2307 struct stat st;
2308 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2309
2310 assert(f);
7560fffc 2311
0284adc6
LP
2312 printf("File Path: %s\n"
2313 "File ID: %s\n"
2314 "Machine ID: %s\n"
2315 "Boot ID: %s\n"
2316 "Sequential Number ID: %s\n"
2317 "State: %s\n"
2318 "Compatible Flags:%s%s\n"
d89c8fdf 2319 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
2320 "Header size: %"PRIu64"\n"
2321 "Arena size: %"PRIu64"\n"
2322 "Data Hash Table Size: %"PRIu64"\n"
2323 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2324 "Rotate Suggested: %s\n"
507f22bd
ZJS
2325 "Head Sequential Number: %"PRIu64"\n"
2326 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2327 "Head Realtime Timestamp: %s\n"
3223f44f 2328 "Tail Realtime Timestamp: %s\n"
ed375beb 2329 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2330 "Objects: %"PRIu64"\n"
2331 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2332 f->path,
2333 sd_id128_to_string(f->header->file_id, a),
2334 sd_id128_to_string(f->header->machine_id, b),
2335 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2336 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2337 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2338 f->header->state == STATE_ONLINE ? "ONLINE" :
2339 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 2340 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
2341 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2342 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2343 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2344 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
2345 le64toh(f->header->header_size),
2346 le64toh(f->header->arena_size),
2347 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2348 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2349 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2350 le64toh(f->header->head_entry_seqnum),
2351 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2352 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2353 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2354 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2355 le64toh(f->header->n_objects),
2356 le64toh(f->header->n_entries));
7560fffc 2357
0284adc6 2358 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2359 printf("Data Objects: %"PRIu64"\n"
0284adc6 2360 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2361 le64toh(f->header->n_data),
0284adc6 2362 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2363
0284adc6 2364 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2365 printf("Field Objects: %"PRIu64"\n"
0284adc6 2366 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2367 le64toh(f->header->n_fields),
0284adc6 2368 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2369
2370 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2371 printf("Tag Objects: %"PRIu64"\n",
2372 le64toh(f->header->n_tags));
3223f44f 2373 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2374 printf("Entry Array Objects: %"PRIu64"\n",
2375 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2376
2377 if (fstat(f->fd, &st) >= 0)
2378 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
7560fffc
LP
2379}
2380
0284adc6
LP
2381int journal_file_open(
2382 const char *fname,
2383 int flags,
2384 mode_t mode,
2385 bool compress,
baed47c3 2386 bool seal,
0284adc6
LP
2387 JournalMetrics *metrics,
2388 MMapCache *mmap_cache,
2389 JournalFile *template,
2390 JournalFile **ret) {
7560fffc 2391
0284adc6
LP
2392 JournalFile *f;
2393 int r;
2394 bool newly_created = false;
7560fffc 2395
0284adc6 2396 assert(fname);
0559d3a5 2397 assert(ret);
7560fffc 2398
0284adc6
LP
2399 if ((flags & O_ACCMODE) != O_RDONLY &&
2400 (flags & O_ACCMODE) != O_RDWR)
2401 return -EINVAL;
7560fffc 2402
a0108012
LP
2403 if (!endswith(fname, ".journal") &&
2404 !endswith(fname, ".journal~"))
0284adc6 2405 return -EINVAL;
7560fffc 2406
0284adc6
LP
2407 f = new0(JournalFile, 1);
2408 if (!f)
2409 return -ENOMEM;
7560fffc 2410
0284adc6
LP
2411 f->fd = -1;
2412 f->mode = mode;
7560fffc 2413
0284adc6
LP
2414 f->flags = flags;
2415 f->prot = prot_from_flags(flags);
2416 f->writable = (flags & O_ACCMODE) != O_RDONLY;
92261977 2417#if defined(HAVE_LZ4)
d89c8fdf
ZJS
2418 f->compress_lz4 = compress;
2419#elif defined(HAVE_XZ)
2420 f->compress_xz = compress;
48b61739 2421#endif
49a32d43 2422#ifdef HAVE_GCRYPT
baed47c3 2423 f->seal = seal;
49a32d43 2424#endif
7560fffc 2425
0284adc6
LP
2426 if (mmap_cache)
2427 f->mmap = mmap_cache_ref(mmap_cache);
2428 else {
84168d80 2429 f->mmap = mmap_cache_new();
0284adc6
LP
2430 if (!f->mmap) {
2431 r = -ENOMEM;
2432 goto fail;
2433 }
2434 }
7560fffc 2435
0284adc6
LP
2436 f->path = strdup(fname);
2437 if (!f->path) {
2438 r = -ENOMEM;
2439 goto fail;
2440 }
7560fffc 2441
4743015d 2442 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
2443 if (!f->chain_cache) {
2444 r = -ENOMEM;
2445 goto fail;
2446 }
2447
0284adc6
LP
2448 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2449 if (f->fd < 0) {
2450 r = -errno;
2451 goto fail;
7560fffc 2452 }
7560fffc 2453
0284adc6
LP
2454 if (fstat(f->fd, &f->last_stat) < 0) {
2455 r = -errno;
2456 goto fail;
2457 }
7560fffc 2458
0284adc6 2459 if (f->last_stat.st_size == 0 && f->writable) {
fb0951b0
LP
2460 uint64_t crtime;
2461
2462 /* Let's attach the creation time to the journal file,
2463 * so that the vacuuming code knows the age of this
2464 * file even if the file might end up corrupted one
2465 * day... Ideally we'd just use the creation time many
2466 * file systems maintain for each file, but there is
2467 * currently no usable API to query this, hence let's
2468 * emulate this via extended attributes. If extended
2469 * attributes are not supported we'll just skip this,
7517e174 2470 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0
LP
2471
2472 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2473 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
7560fffc 2474
feb12d3e 2475#ifdef HAVE_GCRYPT
0284adc6 2476 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2477 * just don't do sealing */
49a32d43
LP
2478 if (f->seal) {
2479 r = journal_file_fss_load(f);
2480 if (r < 0)
2481 f->seal = false;
2482 }
feb12d3e 2483#endif
7560fffc 2484
0284adc6
LP
2485 r = journal_file_init_header(f, template);
2486 if (r < 0)
2487 goto fail;
7560fffc 2488
0284adc6
LP
2489 if (fstat(f->fd, &f->last_stat) < 0) {
2490 r = -errno;
2491 goto fail;
2492 }
fb0951b0
LP
2493
2494 newly_created = true;
0284adc6 2495 }
7560fffc 2496
0284adc6
LP
2497 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2498 r = -EIO;
2499 goto fail;
2500 }
7560fffc 2501
0284adc6
LP
2502 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2503 if (f->header == MAP_FAILED) {
2504 f->header = NULL;
2505 r = -errno;
2506 goto fail;
2507 }
7560fffc 2508
0284adc6
LP
2509 if (!newly_created) {
2510 r = journal_file_verify_header(f);
2511 if (r < 0)
2512 goto fail;
2513 }
7560fffc 2514
feb12d3e 2515#ifdef HAVE_GCRYPT
0284adc6 2516 if (!newly_created && f->writable) {
baed47c3 2517 r = journal_file_fss_load(f);
0284adc6
LP
2518 if (r < 0)
2519 goto fail;
2520 }
feb12d3e 2521#endif
cec736d2
LP
2522
2523 if (f->writable) {
4a92baf3
LP
2524 if (metrics) {
2525 journal_default_metrics(metrics, f->fd);
2526 f->metrics = *metrics;
2527 } else if (template)
2528 f->metrics = template->metrics;
2529
cec736d2
LP
2530 r = journal_file_refresh_header(f);
2531 if (r < 0)
2532 goto fail;
2533 }
2534
feb12d3e 2535#ifdef HAVE_GCRYPT
baed47c3 2536 r = journal_file_hmac_setup(f);
14d10188
LP
2537 if (r < 0)
2538 goto fail;
feb12d3e 2539#endif
14d10188 2540
cec736d2 2541 if (newly_created) {
de190aef 2542 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2543 if (r < 0)
2544 goto fail;
2545
de190aef 2546 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2547 if (r < 0)
2548 goto fail;
7560fffc 2549
feb12d3e 2550#ifdef HAVE_GCRYPT
7560fffc
LP
2551 r = journal_file_append_first_tag(f);
2552 if (r < 0)
2553 goto fail;
feb12d3e 2554#endif
cec736d2
LP
2555 }
2556
de190aef 2557 r = journal_file_map_field_hash_table(f);
cec736d2
LP
2558 if (r < 0)
2559 goto fail;
2560
de190aef 2561 r = journal_file_map_data_hash_table(f);
cec736d2
LP
2562 if (r < 0)
2563 goto fail;
2564
0559d3a5 2565 *ret = f;
cec736d2
LP
2566 return 0;
2567
2568fail:
2569 journal_file_close(f);
2570
2571 return r;
2572}
0ac38b70 2573
baed47c3 2574int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2575 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2576 size_t l;
2577 JournalFile *old_file, *new_file = NULL;
2578 int r;
2579
2580 assert(f);
2581 assert(*f);
2582
2583 old_file = *f;
2584
2585 if (!old_file->writable)
2586 return -EINVAL;
2587
2588 if (!endswith(old_file->path, ".journal"))
2589 return -EINVAL;
2590
2591 l = strlen(old_file->path);
57535f47
ZJS
2592 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2593 (int) l - 8, old_file->path,
2594 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2595 le64toh((*f)->header->head_entry_seqnum),
2596 le64toh((*f)->header->head_entry_realtime));
2597 if (r < 0)
0ac38b70
LP
2598 return -ENOMEM;
2599
0ac38b70 2600 r = rename(old_file->path, p);
0ac38b70
LP
2601 if (r < 0)
2602 return -errno;
2603
ccdbaf91 2604 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2605
baed47c3 2606 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2607 journal_file_close(old_file);
2608
2609 *f = new_file;
2610 return r;
2611}
2612
9447a7f1
LP
2613int journal_file_open_reliably(
2614 const char *fname,
2615 int flags,
2616 mode_t mode,
7560fffc 2617 bool compress,
baed47c3 2618 bool seal,
4a92baf3 2619 JournalMetrics *metrics,
27370278 2620 MMapCache *mmap_cache,
9447a7f1
LP
2621 JournalFile *template,
2622 JournalFile **ret) {
2623
2624 int r;
2625 size_t l;
ed375beb 2626 _cleanup_free_ char *p = NULL;
9447a7f1 2627
baed47c3 2628 r = journal_file_open(fname, flags, mode, compress, seal,
27370278 2629 metrics, mmap_cache, template, ret);
0071d9f1
LP
2630 if (r != -EBADMSG && /* corrupted */
2631 r != -ENODATA && /* truncated */
2632 r != -EHOSTDOWN && /* other machine */
a1a1898f
LP
2633 r != -EPROTONOSUPPORT && /* incompatible feature */
2634 r != -EBUSY && /* unclean shutdown */
2635 r != -ESHUTDOWN /* already archived */)
9447a7f1
LP
2636 return r;
2637
2638 if ((flags & O_ACCMODE) == O_RDONLY)
2639 return r;
2640
2641 if (!(flags & O_CREAT))
2642 return r;
2643
7560fffc
LP
2644 if (!endswith(fname, ".journal"))
2645 return r;
2646
5c70eab4
LP
2647 /* The file is corrupted. Rotate it away and try it again (but only once) */
2648
9447a7f1 2649 l = strlen(fname);
9bf3b535 2650 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
57535f47 2651 (int) l - 8, fname,
9447a7f1 2652 (unsigned long long) now(CLOCK_REALTIME),
9bf3b535 2653 random_u64()) < 0)
9447a7f1
LP
2654 return -ENOMEM;
2655
2656 r = rename(fname, p);
9447a7f1
LP
2657 if (r < 0)
2658 return -errno;
2659
a1a1898f 2660 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2661
baed47c3 2662 return journal_file_open(fname, flags, mode, compress, seal,
27370278 2663 metrics, mmap_cache, template, ret);
9447a7f1
LP
2664}
2665
cf244689
LP
2666int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2667 uint64_t i, n;
2668 uint64_t q, xor_hash = 0;
2669 int r;
2670 EntryItem *items;
2671 dual_timestamp ts;
2672
2673 assert(from);
2674 assert(to);
2675 assert(o);
2676 assert(p);
2677
2678 if (!to->writable)
2679 return -EPERM;
2680
2681 ts.monotonic = le64toh(o->entry.monotonic);
2682 ts.realtime = le64toh(o->entry.realtime);
2683
cf244689 2684 n = journal_file_entry_n_items(o);
4faa7004
TA
2685 /* alloca() can't take 0, hence let's allocate at least one */
2686 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
2687
2688 for (i = 0; i < n; i++) {
4fd052ae
FC
2689 uint64_t l, h;
2690 le64_t le_hash;
cf244689
LP
2691 size_t t;
2692 void *data;
2693 Object *u;
2694
2695 q = le64toh(o->entry.items[i].object_offset);
2696 le_hash = o->entry.items[i].hash;
2697
2698 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2699 if (r < 0)
2700 return r;
2701
2702 if (le_hash != o->data.hash)
2703 return -EBADMSG;
2704
2705 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2706 t = (size_t) l;
2707
2708 /* We hit the limit on 32bit machines */
2709 if ((uint64_t) t != l)
2710 return -E2BIG;
2711
d89c8fdf 2712 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 2713#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51 2714 size_t rsize;
cf244689 2715
d89c8fdf
ZJS
2716 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2717 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2718 if (r < 0)
2719 return r;
cf244689
LP
2720
2721 data = from->compress_buffer;
2722 l = rsize;
3b1a55e1
ZJS
2723#else
2724 return -EPROTONOSUPPORT;
2725#endif
cf244689
LP
2726 } else
2727 data = o->data.payload;
2728
2729 r = journal_file_append_data(to, data, l, &u, &h);
2730 if (r < 0)
2731 return r;
2732
2733 xor_hash ^= le64toh(u->data.hash);
2734 items[i].object_offset = htole64(h);
2735 items[i].hash = u->data.hash;
2736
2737 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2738 if (r < 0)
2739 return r;
2740 }
2741
2742 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2743}
babfc091
LP
2744
2745void journal_default_metrics(JournalMetrics *m, int fd) {
2746 uint64_t fs_size = 0;
2747 struct statvfs ss;
a7bc2c2a 2748 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
babfc091
LP
2749
2750 assert(m);
2751 assert(fd >= 0);
2752
2753 if (fstatvfs(fd, &ss) >= 0)
2754 fs_size = ss.f_frsize * ss.f_blocks;
2755
2756 if (m->max_use == (uint64_t) -1) {
2757
2758 if (fs_size > 0) {
2759 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2760
2761 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2762 m->max_use = DEFAULT_MAX_USE_UPPER;
2763
2764 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2765 m->max_use = DEFAULT_MAX_USE_LOWER;
2766 } else
2767 m->max_use = DEFAULT_MAX_USE_LOWER;
2768 } else {
2769 m->max_use = PAGE_ALIGN(m->max_use);
2770
2771 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2772 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2773 }
2774
2775 if (m->max_size == (uint64_t) -1) {
2776 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2777
2778 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2779 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2780 } else
2781 m->max_size = PAGE_ALIGN(m->max_size);
2782
2783 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2784 m->max_size = JOURNAL_FILE_SIZE_MIN;
2785
2786 if (m->max_size*2 > m->max_use)
2787 m->max_use = m->max_size*2;
2788
2789 if (m->min_size == (uint64_t) -1)
2790 m->min_size = JOURNAL_FILE_SIZE_MIN;
2791 else {
2792 m->min_size = PAGE_ALIGN(m->min_size);
2793
2794 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2795 m->min_size = JOURNAL_FILE_SIZE_MIN;
2796
2797 if (m->min_size > m->max_size)
2798 m->max_size = m->min_size;
2799 }
2800
2801 if (m->keep_free == (uint64_t) -1) {
2802
2803 if (fs_size > 0) {
8621b110 2804 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
2805
2806 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2807 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2808
2809 } else
2810 m->keep_free = DEFAULT_KEEP_FREE;
2811 }
2812
2b43f939
LP
2813 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2814 format_bytes(a, sizeof(a), m->max_use),
2815 format_bytes(b, sizeof(b), m->max_size),
2816 format_bytes(c, sizeof(c), m->min_size),
2817 format_bytes(d, sizeof(d), m->keep_free));
babfc091 2818}
08984293
LP
2819
2820int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
2821 assert(f);
2822 assert(from || to);
2823
2824 if (from) {
162566a4
LP
2825 if (f->header->head_entry_realtime == 0)
2826 return -ENOENT;
08984293 2827
162566a4 2828 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
2829 }
2830
2831 if (to) {
162566a4
LP
2832 if (f->header->tail_entry_realtime == 0)
2833 return -ENOENT;
08984293 2834
162566a4 2835 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
2836 }
2837
2838 return 1;
2839}
2840
2841int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
2842 Object *o;
2843 uint64_t p;
2844 int r;
2845
2846 assert(f);
2847 assert(from || to);
2848
47838ab3 2849 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
2850 if (r <= 0)
2851 return r;
2852
2853 if (le64toh(o->data.n_entries) <= 0)
2854 return 0;
2855
2856 if (from) {
2857 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2858 if (r < 0)
2859 return r;
2860
2861 *from = le64toh(o->entry.monotonic);
2862 }
2863
2864 if (to) {
2865 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2866 if (r < 0)
2867 return r;
2868
2869 r = generic_array_get_plus_one(f,
2870 le64toh(o->data.entry_offset),
2871 le64toh(o->data.entry_array_offset),
2872 le64toh(o->data.n_entries)-1,
2873 &o, NULL);
2874 if (r <= 0)
2875 return r;
2876
2877 *to = le64toh(o->entry.monotonic);
2878 }
2879
2880 return 1;
2881}
dca6219e 2882
fb0951b0 2883bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
2884 assert(f);
2885
2886 /* If we gained new header fields we gained new features,
2887 * hence suggest a rotation */
361f9cbc
LP
2888 if (le64toh(f->header->header_size) < sizeof(Header)) {
2889 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 2890 return true;
361f9cbc 2891 }
dca6219e
LP
2892
2893 /* Let's check if the hash tables grew over a certain fill
2894 * level (75%, borrowing this value from Java's hash table
2895 * implementation), and if so suggest a rotation. To calculate
2896 * the fill level we need the n_data field, which only exists
2897 * in newer versions. */
2898
2899 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 2900 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 2901 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
2902 f->path,
2903 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
2904 le64toh(f->header->n_data),
2905 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2906 (unsigned long long) f->last_stat.st_size,
2907 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 2908 return true;
361f9cbc 2909 }
dca6219e
LP
2910
2911 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 2912 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 2913 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
2914 f->path,
2915 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
2916 le64toh(f->header->n_fields),
2917 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 2918 return true;
361f9cbc 2919 }
dca6219e 2920
0598fd4a
LP
2921 /* Are the data objects properly indexed by field objects? */
2922 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2923 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2924 le64toh(f->header->n_data) > 0 &&
2925 le64toh(f->header->n_fields) == 0)
2926 return true;
2927
fb0951b0
LP
2928 if (max_file_usec > 0) {
2929 usec_t t, h;
2930
2931 h = le64toh(f->header->head_entry_realtime);
2932 t = now(CLOCK_REALTIME);
2933
2934 if (h > 0 && t > h + max_file_usec)
2935 return true;
2936 }
2937
dca6219e
LP
2938 return false;
2939}