]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
journal: abstract the resetting of JournalFile's location
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <sys/mman.h>
23#include <errno.h>
24#include <sys/uio.h>
25#include <unistd.h>
26#include <sys/statvfs.h>
27#include <fcntl.h>
28#include <stddef.h>
d2edfae0 29#include <sys/xattr.h>
fb0951b0 30
cec736d2
LP
31#include "journal-def.h"
32#include "journal-file.h"
0284adc6 33#include "journal-authenticate.h"
cec736d2 34#include "lookup3.h"
807e17f0 35#include "compress.h"
7560fffc 36#include "fsprg.h"
cec736d2 37
4a92baf3
LP
38#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
39#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 40
be19b7df 41#define COMPRESSION_SIZE_THRESHOLD (512ULL)
807e17f0 42
babfc091 43/* This is the minimum journal file size */
253f59df 44#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
babfc091
LP
45
46/* These are the lower and upper bounds if we deduce the max_use value
47 * from the file system size */
48#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
49#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
50
51/* This is the upper bound if we deduce max_size from max_use */
71100051 52#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
babfc091
LP
53
54/* This is the upper bound if we deduce the keep_free value from the
55 * file system size */
56#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
57
58/* This is the keep_free value when we can't determine the file
59 * system size */
60#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
61
dca6219e
LP
62/* n_data was the first entry we added after the initial file format design */
63#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 64
a4bcff5b
LP
65/* How many entries to keep in the entry array chain cache at max */
66#define CHAIN_CACHE_MAX 20
67
a676e665
LP
68/* How much to increase the journal file size at once each time we allocate something new. */
69#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
70
9588bc32 71static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
72 assert(f);
73
74 if (!f->writable)
75 return -EPERM;
76
77 if (!(f->fd >= 0 && f->header))
78 return -EINVAL;
79
80 switch(f->header->state) {
81 case STATE_ONLINE:
82 return 0;
83
84 case STATE_OFFLINE:
85 f->header->state = STATE_ONLINE;
86 fsync(f->fd);
87 return 0;
88
89 default:
90 return -EINVAL;
91 }
92}
93
94int journal_file_set_offline(JournalFile *f) {
95 assert(f);
96
97 if (!f->writable)
98 return -EPERM;
99
100 if (!(f->fd >= 0 && f->header))
101 return -EINVAL;
102
103 if (f->header->state != STATE_ONLINE)
104 return 0;
105
106 fsync(f->fd);
107
108 f->header->state = STATE_OFFLINE;
109
110 fsync(f->fd);
111
112 return 0;
113}
114
cec736d2 115void journal_file_close(JournalFile *f) {
de190aef 116 assert(f);
cec736d2 117
feb12d3e 118#ifdef HAVE_GCRYPT
b0af6f41 119 /* Write the final tag */
c586dbf1 120 if (f->seal && f->writable)
b0af6f41 121 journal_file_append_tag(f);
feb12d3e 122#endif
b0af6f41 123
7560fffc 124 /* Sync everything to disk, before we mark the file offline */
16e9f408
LP
125 if (f->mmap && f->fd >= 0)
126 mmap_cache_close_fd(f->mmap, f->fd);
7560fffc 127
26687bf8 128 journal_file_set_offline(f);
cec736d2 129
26687bf8 130 if (f->header)
d384c7a8 131 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
cec736d2 132
03e334a1 133 safe_close(f->fd);
cec736d2 134 free(f->path);
807e17f0 135
16e9f408
LP
136 if (f->mmap)
137 mmap_cache_unref(f->mmap);
138
4743015d 139 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 140
d89c8fdf 141#if defined(HAVE_XZ) || defined(HAVE_LZ4)
807e17f0
LP
142 free(f->compress_buffer);
143#endif
144
7560fffc 145#ifdef HAVE_GCRYPT
baed47c3
LP
146 if (f->fss_file)
147 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
b7c9ae91
LP
148 else if (f->fsprg_state)
149 free(f->fsprg_state);
150
151 free(f->fsprg_seed);
7560fffc
LP
152
153 if (f->hmac)
154 gcry_md_close(f->hmac);
155#endif
156
cec736d2
LP
157 free(f);
158}
159
0ac38b70 160static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 161 Header h = {};
cec736d2
LP
162 ssize_t k;
163 int r;
164
165 assert(f);
166
7560fffc 167 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 168 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 169
d89c8fdf
ZJS
170 h.incompatible_flags |= htole32(
171 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
172 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 173
d89c8fdf
ZJS
174 h.compatible_flags = htole32(
175 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 176
cec736d2
LP
177 r = sd_id128_randomize(&h.file_id);
178 if (r < 0)
179 return r;
180
0ac38b70
LP
181 if (template) {
182 h.seqnum_id = template->header->seqnum_id;
beec0085 183 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
184 } else
185 h.seqnum_id = h.file_id;
cec736d2
LP
186
187 k = pwrite(f->fd, &h, sizeof(h), 0);
188 if (k < 0)
189 return -errno;
190
191 if (k != sizeof(h))
192 return -EIO;
193
194 return 0;
195}
196
197static int journal_file_refresh_header(JournalFile *f) {
198 int r;
de190aef 199 sd_id128_t boot_id;
cec736d2
LP
200
201 assert(f);
202
203 r = sd_id128_get_machine(&f->header->machine_id);
204 if (r < 0)
205 return r;
206
de190aef 207 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
208 if (r < 0)
209 return r;
210
de190aef
LP
211 if (sd_id128_equal(boot_id, f->header->boot_id))
212 f->tail_entry_monotonic_valid = true;
213
214 f->header->boot_id = boot_id;
215
26687bf8 216 journal_file_set_online(f);
b788cc23 217
7560fffc 218 /* Sync the online state to disk */
a676e665 219 fsync(f->fd);
b788cc23 220
cec736d2
LP
221 return 0;
222}
223
224static int journal_file_verify_header(JournalFile *f) {
d89c8fdf
ZJS
225 uint32_t flags;
226
cec736d2
LP
227 assert(f);
228
7560fffc 229 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
230 return -EBADMSG;
231
7560fffc
LP
232 /* In both read and write mode we refuse to open files with
233 * incompatible flags we don't know */
d89c8fdf
ZJS
234 flags = le32toh(f->header->incompatible_flags);
235 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
236 if (flags & ~HEADER_INCOMPATIBLE_ANY)
237 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
238 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
239 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
240 if (flags)
241 log_debug("Journal file %s uses incompatible flags %"PRIx32
242 " disabled at compilation time.", f->path, flags);
cec736d2 243 return -EPROTONOSUPPORT;
d89c8fdf 244 }
cec736d2 245
7560fffc
LP
246 /* When open for writing we refuse to open files with
247 * compatible flags, too */
d89c8fdf
ZJS
248 flags = le32toh(f->header->compatible_flags);
249 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
250 if (flags & ~HEADER_COMPATIBLE_ANY)
251 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
252 f->path, flags & ~HEADER_COMPATIBLE_ANY);
253 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
254 if (flags)
255 log_debug("Journal file %s uses compatible flags %"PRIx32
256 " disabled at compilation time.", f->path, flags);
257 return -EPROTONOSUPPORT;
7560fffc
LP
258 }
259
db11ac1a
LP
260 if (f->header->state >= _STATE_MAX)
261 return -EBADMSG;
262
dca6219e
LP
263 /* The first addition was n_data, so check that we are at least this large */
264 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
265 return -EBADMSG;
266
8088cbd3 267 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
268 return -EBADMSG;
269
db11ac1a
LP
270 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
271 return -ENODATA;
272
273 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
274 return -ENODATA;
275
7762e02b
LP
276 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
277 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
278 !VALID64(le64toh(f->header->tail_object_offset)) ||
279 !VALID64(le64toh(f->header->entry_array_offset)))
280 return -ENODATA;
281
cec736d2 282 if (f->writable) {
ccdbaf91 283 uint8_t state;
cec736d2
LP
284 sd_id128_t machine_id;
285 int r;
286
287 r = sd_id128_get_machine(&machine_id);
288 if (r < 0)
289 return r;
290
291 if (!sd_id128_equal(machine_id, f->header->machine_id))
292 return -EHOSTDOWN;
293
de190aef 294 state = f->header->state;
cec736d2 295
71fa6f00
LP
296 if (state == STATE_ONLINE) {
297 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
298 return -EBUSY;
299 } else if (state == STATE_ARCHIVED)
cec736d2 300 return -ESHUTDOWN;
71fa6f00
LP
301 else if (state != STATE_OFFLINE) {
302 log_debug("Journal file %s has unknown state %u.", f->path, state);
303 return -EBUSY;
304 }
cec736d2
LP
305 }
306
d89c8fdf
ZJS
307 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
308 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 309
f1889c91 310 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 311
cec736d2
LP
312 return 0;
313}
314
315static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 316 uint64_t old_size, new_size;
fec2aa2f 317 int r;
cec736d2
LP
318
319 assert(f);
320
cec736d2 321 /* We assume that this file is not sparse, and we know that
38ac38b2 322 * for sure, since we always call posix_fallocate()
cec736d2
LP
323 * ourselves */
324
325 old_size =
23b0b2b2 326 le64toh(f->header->header_size) +
cec736d2
LP
327 le64toh(f->header->arena_size);
328
bc85bfee 329 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
330 if (new_size < le64toh(f->header->header_size))
331 new_size = le64toh(f->header->header_size);
bc85bfee
LP
332
333 if (new_size <= old_size)
cec736d2
LP
334 return 0;
335
a676e665 336 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 337 return -E2BIG;
cec736d2 338
a676e665 339 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
340 struct statvfs svfs;
341
342 if (fstatvfs(f->fd, &svfs) >= 0) {
343 uint64_t available;
344
345 available = svfs.f_bfree * svfs.f_bsize;
346
bc85bfee
LP
347 if (available >= f->metrics.keep_free)
348 available -= f->metrics.keep_free;
cec736d2
LP
349 else
350 available = 0;
351
352 if (new_size - old_size > available)
353 return -E2BIG;
354 }
355 }
356
eda4b58b
LP
357 /* Increase by larger blocks at once */
358 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
359 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
360 new_size = f->metrics.max_size;
361
bc85bfee
LP
362 /* Note that the glibc fallocate() fallback is very
363 inefficient, hence we try to minimize the allocation area
364 as we can. */
fec2aa2f
GV
365 r = posix_fallocate(f->fd, old_size, new_size - old_size);
366 if (r != 0)
367 return -r;
cec736d2 368
eda4b58b
LP
369 if (fstat(f->fd, &f->last_stat) < 0)
370 return -errno;
cec736d2 371
23b0b2b2 372 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2
LP
373
374 return 0;
375}
376
78519831 377static unsigned type_to_context(ObjectType type) {
d3d3208f 378 /* One context for each type, plus one catch-all for the rest */
69adae51 379 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
d05089d8 380 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
381}
382
7a9dabea 383static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
cec736d2 384 assert(f);
cec736d2
LP
385 assert(ret);
386
7762e02b
LP
387 if (size <= 0)
388 return -EINVAL;
389
2a59ea54 390 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
391 if (offset + size > (uint64_t) f->last_stat.st_size) {
392 /* Hmm, out of range? Let's refresh the fstat() data
393 * first, before we trust that check. */
394
395 if (fstat(f->fd, &f->last_stat) < 0 ||
396 offset + size > (uint64_t) f->last_stat.st_size)
397 return -EADDRNOTAVAIL;
398 }
399
7a9dabea 400 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
401}
402
16e9f408
LP
403static uint64_t minimum_header_size(Object *o) {
404
b8e891e6 405 static const uint64_t table[] = {
16e9f408
LP
406 [OBJECT_DATA] = sizeof(DataObject),
407 [OBJECT_FIELD] = sizeof(FieldObject),
408 [OBJECT_ENTRY] = sizeof(EntryObject),
409 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
410 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
411 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
412 [OBJECT_TAG] = sizeof(TagObject),
413 };
414
415 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
416 return sizeof(ObjectHeader);
417
418 return table[o->object.type];
419}
420
78519831 421int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
422 int r;
423 void *t;
424 Object *o;
425 uint64_t s;
426
427 assert(f);
428 assert(ret);
429
db11ac1a
LP
430 /* Objects may only be located at multiple of 64 bit */
431 if (!VALID64(offset))
432 return -EFAULT;
433
7a9dabea 434 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
435 if (r < 0)
436 return r;
437
438 o = (Object*) t;
439 s = le64toh(o->object.size);
440
441 if (s < sizeof(ObjectHeader))
442 return -EBADMSG;
443
16e9f408
LP
444 if (o->object.type <= OBJECT_UNUSED)
445 return -EBADMSG;
446
447 if (s < minimum_header_size(o))
448 return -EBADMSG;
449
d05089d8 450 if (type > OBJECT_UNUSED && o->object.type != type)
cec736d2
LP
451 return -EBADMSG;
452
453 if (s > sizeof(ObjectHeader)) {
7a9dabea 454 r = journal_file_move_to(f, type, false, offset, s, &t);
cec736d2
LP
455 if (r < 0)
456 return r;
457
458 o = (Object*) t;
459 }
460
cec736d2
LP
461 *ret = o;
462 return 0;
463}
464
d98cc1f2 465static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
466 uint64_t r;
467
468 assert(f);
469
beec0085 470 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
471
472 if (seqnum) {
de190aef 473 /* If an external seqnum counter was passed, we update
c2373f84
LP
474 * both the local and the external one, and set it to
475 * the maximum of both */
476
477 if (*seqnum + 1 > r)
478 r = *seqnum + 1;
479
480 *seqnum = r;
481 }
482
beec0085 483 f->header->tail_entry_seqnum = htole64(r);
cec736d2 484
beec0085
LP
485 if (f->header->head_entry_seqnum == 0)
486 f->header->head_entry_seqnum = htole64(r);
de190aef 487
cec736d2
LP
488 return r;
489}
490
78519831 491int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
492 int r;
493 uint64_t p;
494 Object *tail, *o;
495 void *t;
496
497 assert(f);
d05089d8 498 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
499 assert(size >= sizeof(ObjectHeader));
500 assert(offset);
501 assert(ret);
502
26687bf8
OS
503 r = journal_file_set_online(f);
504 if (r < 0)
505 return r;
506
cec736d2 507 p = le64toh(f->header->tail_object_offset);
cec736d2 508 if (p == 0)
23b0b2b2 509 p = le64toh(f->header->header_size);
cec736d2 510 else {
d05089d8 511 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
512 if (r < 0)
513 return r;
514
515 p += ALIGN64(le64toh(tail->object.size));
516 }
517
518 r = journal_file_allocate(f, p, size);
519 if (r < 0)
520 return r;
521
fcde2389 522 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
523 if (r < 0)
524 return r;
525
526 o = (Object*) t;
527
528 zero(o->object);
de190aef 529 o->object.type = type;
cec736d2
LP
530 o->object.size = htole64(size);
531
532 f->header->tail_object_offset = htole64(p);
cec736d2
LP
533 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
534
535 *ret = o;
536 *offset = p;
537
538 return 0;
539}
540
de190aef 541static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
542 uint64_t s, p;
543 Object *o;
544 int r;
545
546 assert(f);
547
dfabe643 548 /* We estimate that we need 1 hash table entry per 768 of
4a92baf3
LP
549 journal file and we want to make sure we never get beyond
550 75% fill level. Calculate the hash table size for the
551 maximum file size based on these metrics. */
552
dfabe643 553 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
554 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
555 s = DEFAULT_DATA_HASH_TABLE_SIZE;
556
507f22bd 557 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 558
de190aef
LP
559 r = journal_file_append_object(f,
560 OBJECT_DATA_HASH_TABLE,
561 offsetof(Object, hash_table.items) + s,
562 &o, &p);
cec736d2
LP
563 if (r < 0)
564 return r;
565
29804cc1 566 memzero(o->hash_table.items, s);
cec736d2 567
de190aef
LP
568 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
569 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
570
571 return 0;
572}
573
de190aef 574static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
575 uint64_t s, p;
576 Object *o;
577 int r;
578
579 assert(f);
580
3c1668da
LP
581 /* We use a fixed size hash table for the fields as this
582 * number should grow very slowly only */
583
de190aef
LP
584 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
585 r = journal_file_append_object(f,
586 OBJECT_FIELD_HASH_TABLE,
587 offsetof(Object, hash_table.items) + s,
588 &o, &p);
cec736d2
LP
589 if (r < 0)
590 return r;
591
29804cc1 592 memzero(o->hash_table.items, s);
cec736d2 593
de190aef
LP
594 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
595 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
596
597 return 0;
598}
599
de190aef 600static int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
601 uint64_t s, p;
602 void *t;
603 int r;
604
605 assert(f);
606
de190aef
LP
607 p = le64toh(f->header->data_hash_table_offset);
608 s = le64toh(f->header->data_hash_table_size);
cec736d2 609
de190aef 610 r = journal_file_move_to(f,
16e9f408 611 OBJECT_DATA_HASH_TABLE,
fcde2389 612 true,
de190aef
LP
613 p, s,
614 &t);
cec736d2
LP
615 if (r < 0)
616 return r;
617
de190aef 618 f->data_hash_table = t;
cec736d2
LP
619 return 0;
620}
621
de190aef 622static int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
623 uint64_t s, p;
624 void *t;
625 int r;
626
627 assert(f);
628
de190aef
LP
629 p = le64toh(f->header->field_hash_table_offset);
630 s = le64toh(f->header->field_hash_table_size);
cec736d2 631
de190aef 632 r = journal_file_move_to(f,
16e9f408 633 OBJECT_FIELD_HASH_TABLE,
fcde2389 634 true,
de190aef
LP
635 p, s,
636 &t);
cec736d2
LP
637 if (r < 0)
638 return r;
639
de190aef 640 f->field_hash_table = t;
cec736d2
LP
641 return 0;
642}
643
3c1668da
LP
644static int journal_file_link_field(
645 JournalFile *f,
646 Object *o,
647 uint64_t offset,
648 uint64_t hash) {
649
650 uint64_t p, h;
651 int r;
652
653 assert(f);
654 assert(o);
655 assert(offset > 0);
656
657 if (o->object.type != OBJECT_FIELD)
658 return -EINVAL;
659
660 /* This might alter the window we are looking at */
661
662 o->field.next_hash_offset = o->field.head_data_offset = 0;
663
664 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
665 p = le64toh(f->field_hash_table[h].tail_hash_offset);
666 if (p == 0)
667 f->field_hash_table[h].head_hash_offset = htole64(offset);
668 else {
669 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
670 if (r < 0)
671 return r;
672
673 o->field.next_hash_offset = htole64(offset);
674 }
675
676 f->field_hash_table[h].tail_hash_offset = htole64(offset);
677
678 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
679 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
680
681 return 0;
682}
683
684static int journal_file_link_data(
685 JournalFile *f,
686 Object *o,
687 uint64_t offset,
688 uint64_t hash) {
689
de190aef 690 uint64_t p, h;
cec736d2
LP
691 int r;
692
693 assert(f);
694 assert(o);
695 assert(offset > 0);
b588975f
LP
696
697 if (o->object.type != OBJECT_DATA)
698 return -EINVAL;
cec736d2 699
48496df6
LP
700 /* This might alter the window we are looking at */
701
de190aef
LP
702 o->data.next_hash_offset = o->data.next_field_offset = 0;
703 o->data.entry_offset = o->data.entry_array_offset = 0;
704 o->data.n_entries = 0;
cec736d2 705
de190aef 706 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
8db4213e 707 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 708 if (p == 0)
cec736d2 709 /* Only entry in the hash table is easy */
de190aef 710 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 711 else {
48496df6
LP
712 /* Move back to the previous data object, to patch in
713 * pointer */
cec736d2 714
de190aef 715 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
716 if (r < 0)
717 return r;
718
de190aef 719 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
720 }
721
de190aef 722 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 723
dca6219e
LP
724 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
725 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
726
cec736d2
LP
727 return 0;
728}
729
3c1668da
LP
730int journal_file_find_field_object_with_hash(
731 JournalFile *f,
732 const void *field, uint64_t size, uint64_t hash,
733 Object **ret, uint64_t *offset) {
734
735 uint64_t p, osize, h;
736 int r;
737
738 assert(f);
739 assert(field && size > 0);
740
741 osize = offsetof(Object, field.payload) + size;
742
743 if (f->header->field_hash_table_size == 0)
744 return -EBADMSG;
745
746 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
747 p = le64toh(f->field_hash_table[h].head_hash_offset);
748
749 while (p > 0) {
750 Object *o;
751
752 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
753 if (r < 0)
754 return r;
755
756 if (le64toh(o->field.hash) == hash &&
757 le64toh(o->object.size) == osize &&
758 memcmp(o->field.payload, field, size) == 0) {
759
760 if (ret)
761 *ret = o;
762 if (offset)
763 *offset = p;
764
765 return 1;
766 }
767
768 p = le64toh(o->field.next_hash_offset);
769 }
770
771 return 0;
772}
773
774int journal_file_find_field_object(
775 JournalFile *f,
776 const void *field, uint64_t size,
777 Object **ret, uint64_t *offset) {
778
779 uint64_t hash;
780
781 assert(f);
782 assert(field && size > 0);
783
784 hash = hash64(field, size);
785
786 return journal_file_find_field_object_with_hash(f,
787 field, size, hash,
788 ret, offset);
789}
790
de190aef
LP
791int journal_file_find_data_object_with_hash(
792 JournalFile *f,
793 const void *data, uint64_t size, uint64_t hash,
794 Object **ret, uint64_t *offset) {
48496df6 795
de190aef 796 uint64_t p, osize, h;
cec736d2
LP
797 int r;
798
799 assert(f);
800 assert(data || size == 0);
801
802 osize = offsetof(Object, data.payload) + size;
803
bc85bfee
LP
804 if (f->header->data_hash_table_size == 0)
805 return -EBADMSG;
806
de190aef
LP
807 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
808 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 809
de190aef
LP
810 while (p > 0) {
811 Object *o;
cec736d2 812
de190aef 813 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
814 if (r < 0)
815 return r;
816
807e17f0 817 if (le64toh(o->data.hash) != hash)
85a131e8 818 goto next;
807e17f0 819
d89c8fdf 820 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 821#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51
ZJS
822 uint64_t l;
823 size_t rsize;
cec736d2 824
807e17f0
LP
825 l = le64toh(o->object.size);
826 if (l <= offsetof(Object, data.payload))
cec736d2
LP
827 return -EBADMSG;
828
807e17f0
LP
829 l -= offsetof(Object, data.payload);
830
d89c8fdf
ZJS
831 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
832 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
833 if (r < 0)
834 return r;
807e17f0 835
b785c858 836 if (rsize == size &&
807e17f0
LP
837 memcmp(f->compress_buffer, data, size) == 0) {
838
839 if (ret)
840 *ret = o;
841
842 if (offset)
843 *offset = p;
844
845 return 1;
846 }
3b1a55e1
ZJS
847#else
848 return -EPROTONOSUPPORT;
849#endif
807e17f0
LP
850 } else if (le64toh(o->object.size) == osize &&
851 memcmp(o->data.payload, data, size) == 0) {
852
cec736d2
LP
853 if (ret)
854 *ret = o;
855
856 if (offset)
857 *offset = p;
858
de190aef 859 return 1;
cec736d2
LP
860 }
861
85a131e8 862 next:
cec736d2
LP
863 p = le64toh(o->data.next_hash_offset);
864 }
865
de190aef
LP
866 return 0;
867}
868
869int journal_file_find_data_object(
870 JournalFile *f,
871 const void *data, uint64_t size,
872 Object **ret, uint64_t *offset) {
873
874 uint64_t hash;
875
876 assert(f);
877 assert(data || size == 0);
878
879 hash = hash64(data, size);
880
881 return journal_file_find_data_object_with_hash(f,
882 data, size, hash,
883 ret, offset);
884}
885
3c1668da
LP
886static int journal_file_append_field(
887 JournalFile *f,
888 const void *field, uint64_t size,
889 Object **ret, uint64_t *offset) {
890
891 uint64_t hash, p;
892 uint64_t osize;
893 Object *o;
894 int r;
895
896 assert(f);
897 assert(field && size > 0);
898
899 hash = hash64(field, size);
900
901 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
902 if (r < 0)
903 return r;
904 else if (r > 0) {
905
906 if (ret)
907 *ret = o;
908
909 if (offset)
910 *offset = p;
911
912 return 0;
913 }
914
915 osize = offsetof(Object, field.payload) + size;
916 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
917 if (r < 0)
918 return r;
3c1668da
LP
919
920 o->field.hash = htole64(hash);
921 memcpy(o->field.payload, field, size);
922
923 r = journal_file_link_field(f, o, p, hash);
924 if (r < 0)
925 return r;
926
927 /* The linking might have altered the window, so let's
928 * refresh our pointer */
929 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
930 if (r < 0)
931 return r;
932
933#ifdef HAVE_GCRYPT
934 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
935 if (r < 0)
936 return r;
937#endif
938
939 if (ret)
940 *ret = o;
941
942 if (offset)
943 *offset = p;
944
945 return 0;
946}
947
48496df6
LP
948static int journal_file_append_data(
949 JournalFile *f,
950 const void *data, uint64_t size,
951 Object **ret, uint64_t *offset) {
952
de190aef
LP
953 uint64_t hash, p;
954 uint64_t osize;
955 Object *o;
d89c8fdf 956 int r, compression = 0;
3c1668da 957 const void *eq;
de190aef
LP
958
959 assert(f);
960 assert(data || size == 0);
961
962 hash = hash64(data, size);
963
964 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
965 if (r < 0)
966 return r;
967 else if (r > 0) {
968
969 if (ret)
970 *ret = o;
971
972 if (offset)
973 *offset = p;
974
975 return 0;
976 }
977
978 osize = offsetof(Object, data.payload) + size;
979 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
980 if (r < 0)
981 return r;
982
cec736d2 983 o->data.hash = htole64(hash);
807e17f0 984
d89c8fdf
ZJS
985#if defined(HAVE_XZ) || defined(HAVE_LZ4)
986 if (f->compress_xz &&
807e17f0 987 size >= COMPRESSION_SIZE_THRESHOLD) {
fa1c4b51 988 size_t rsize;
807e17f0 989
d89c8fdf 990 compression = compress_blob(data, size, o->data.payload, &rsize);
807e17f0 991
d89c8fdf 992 if (compression) {
807e17f0 993 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 994 o->object.flags |= compression;
807e17f0 995
fa1c4b51 996 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 997 size, rsize, object_compressed_to_string(compression));
807e17f0
LP
998 }
999 }
1000#endif
1001
d89c8fdf 1002 if (!compression && size > 0)
807e17f0 1003 memcpy(o->data.payload, data, size);
cec736d2 1004
de190aef 1005 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1006 if (r < 0)
1007 return r;
1008
48496df6
LP
1009 /* The linking might have altered the window, so let's
1010 * refresh our pointer */
1011 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1012 if (r < 0)
1013 return r;
1014
08c6f819
SL
1015 if (!data)
1016 eq = NULL;
1017 else
1018 eq = memchr(data, '=', size);
3c1668da 1019 if (eq && eq > data) {
748db592 1020 Object *fo = NULL;
3c1668da 1021 uint64_t fp;
3c1668da
LP
1022
1023 /* Create field object ... */
1024 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1025 if (r < 0)
1026 return r;
1027
1028 /* ... and link it in. */
1029 o->data.next_field_offset = fo->field.head_data_offset;
1030 fo->field.head_data_offset = le64toh(p);
1031 }
1032
5996c7c2
LP
1033#ifdef HAVE_GCRYPT
1034 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1035 if (r < 0)
1036 return r;
1037#endif
1038
cec736d2
LP
1039 if (ret)
1040 *ret = o;
1041
1042 if (offset)
de190aef 1043 *offset = p;
cec736d2
LP
1044
1045 return 0;
1046}
1047
1048uint64_t journal_file_entry_n_items(Object *o) {
1049 assert(o);
b588975f
LP
1050
1051 if (o->object.type != OBJECT_ENTRY)
1052 return 0;
cec736d2
LP
1053
1054 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1055}
1056
0284adc6 1057uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1058 assert(o);
b588975f
LP
1059
1060 if (o->object.type != OBJECT_ENTRY_ARRAY)
1061 return 0;
de190aef
LP
1062
1063 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1064}
1065
fb9a24b6
LP
1066uint64_t journal_file_hash_table_n_items(Object *o) {
1067 assert(o);
b588975f
LP
1068
1069 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1070 o->object.type != OBJECT_FIELD_HASH_TABLE)
1071 return 0;
fb9a24b6
LP
1072
1073 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1074}
1075
de190aef 1076static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1077 le64_t *first,
1078 le64_t *idx,
de190aef 1079 uint64_t p) {
cec736d2 1080 int r;
de190aef
LP
1081 uint64_t n = 0, ap = 0, q, i, a, hidx;
1082 Object *o;
1083
cec736d2 1084 assert(f);
de190aef
LP
1085 assert(first);
1086 assert(idx);
1087 assert(p > 0);
cec736d2 1088
de190aef
LP
1089 a = le64toh(*first);
1090 i = hidx = le64toh(*idx);
1091 while (a > 0) {
1092
1093 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1094 if (r < 0)
1095 return r;
cec736d2 1096
de190aef
LP
1097 n = journal_file_entry_array_n_items(o);
1098 if (i < n) {
1099 o->entry_array.items[i] = htole64(p);
1100 *idx = htole64(hidx + 1);
1101 return 0;
1102 }
cec736d2 1103
de190aef
LP
1104 i -= n;
1105 ap = a;
1106 a = le64toh(o->entry_array.next_entry_array_offset);
1107 }
1108
1109 if (hidx > n)
1110 n = (hidx+1) * 2;
1111 else
1112 n = n * 2;
1113
1114 if (n < 4)
1115 n = 4;
1116
1117 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1118 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1119 &o, &q);
cec736d2
LP
1120 if (r < 0)
1121 return r;
1122
feb12d3e 1123#ifdef HAVE_GCRYPT
5996c7c2 1124 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1125 if (r < 0)
1126 return r;
feb12d3e 1127#endif
b0af6f41 1128
de190aef 1129 o->entry_array.items[i] = htole64(p);
cec736d2 1130
de190aef 1131 if (ap == 0)
7be3aa17 1132 *first = htole64(q);
cec736d2 1133 else {
de190aef 1134 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1135 if (r < 0)
1136 return r;
1137
de190aef
LP
1138 o->entry_array.next_entry_array_offset = htole64(q);
1139 }
cec736d2 1140
2dee23eb
LP
1141 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1142 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1143
de190aef
LP
1144 *idx = htole64(hidx + 1);
1145
1146 return 0;
1147}
cec736d2 1148
de190aef 1149static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1150 le64_t *extra,
1151 le64_t *first,
1152 le64_t *idx,
de190aef
LP
1153 uint64_t p) {
1154
1155 int r;
1156
1157 assert(f);
1158 assert(extra);
1159 assert(first);
1160 assert(idx);
1161 assert(p > 0);
1162
1163 if (*idx == 0)
1164 *extra = htole64(p);
1165 else {
4fd052ae 1166 le64_t i;
de190aef 1167
7be3aa17 1168 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1169 r = link_entry_into_array(f, first, &i, p);
1170 if (r < 0)
1171 return r;
cec736d2
LP
1172 }
1173
de190aef
LP
1174 *idx = htole64(le64toh(*idx) + 1);
1175 return 0;
1176}
1177
1178static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1179 uint64_t p;
1180 int r;
1181 assert(f);
1182 assert(o);
1183 assert(offset > 0);
1184
1185 p = le64toh(o->entry.items[i].object_offset);
1186 if (p == 0)
1187 return -EINVAL;
1188
1189 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1190 if (r < 0)
1191 return r;
1192
de190aef
LP
1193 return link_entry_into_array_plus_one(f,
1194 &o->data.entry_offset,
1195 &o->data.entry_array_offset,
1196 &o->data.n_entries,
1197 offset);
cec736d2
LP
1198}
1199
1200static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1201 uint64_t n, i;
cec736d2
LP
1202 int r;
1203
1204 assert(f);
1205 assert(o);
1206 assert(offset > 0);
b588975f
LP
1207
1208 if (o->object.type != OBJECT_ENTRY)
1209 return -EINVAL;
cec736d2 1210
b788cc23
LP
1211 __sync_synchronize();
1212
cec736d2 1213 /* Link up the entry itself */
de190aef
LP
1214 r = link_entry_into_array(f,
1215 &f->header->entry_array_offset,
1216 &f->header->n_entries,
1217 offset);
1218 if (r < 0)
1219 return r;
cec736d2 1220
507f22bd 1221 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1222
de190aef 1223 if (f->header->head_entry_realtime == 0)
0ac38b70 1224 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1225
0ac38b70 1226 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1227 f->header->tail_entry_monotonic = o->entry.monotonic;
1228
1229 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1230
1231 /* Link up the items */
1232 n = journal_file_entry_n_items(o);
1233 for (i = 0; i < n; i++) {
1234 r = journal_file_link_entry_item(f, o, offset, i);
1235 if (r < 0)
1236 return r;
1237 }
1238
cec736d2
LP
1239 return 0;
1240}
1241
1242static int journal_file_append_entry_internal(
1243 JournalFile *f,
1244 const dual_timestamp *ts,
1245 uint64_t xor_hash,
1246 const EntryItem items[], unsigned n_items,
de190aef 1247 uint64_t *seqnum,
cec736d2
LP
1248 Object **ret, uint64_t *offset) {
1249 uint64_t np;
1250 uint64_t osize;
1251 Object *o;
1252 int r;
1253
1254 assert(f);
1255 assert(items || n_items == 0);
de190aef 1256 assert(ts);
cec736d2
LP
1257
1258 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1259
de190aef 1260 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1261 if (r < 0)
1262 return r;
1263
d98cc1f2 1264 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1265 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1266 o->entry.realtime = htole64(ts->realtime);
1267 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1268 o->entry.xor_hash = htole64(xor_hash);
1269 o->entry.boot_id = f->header->boot_id;
1270
feb12d3e 1271#ifdef HAVE_GCRYPT
5996c7c2 1272 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1273 if (r < 0)
1274 return r;
feb12d3e 1275#endif
b0af6f41 1276
cec736d2
LP
1277 r = journal_file_link_entry(f, o, np);
1278 if (r < 0)
1279 return r;
1280
1281 if (ret)
1282 *ret = o;
1283
1284 if (offset)
1285 *offset = np;
1286
1287 return 0;
1288}
1289
cf244689 1290void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1291 assert(f);
1292
1293 /* inotify() does not receive IN_MODIFY events from file
1294 * accesses done via mmap(). After each access we hence
1295 * trigger IN_MODIFY by truncating the journal file to its
1296 * current size which triggers IN_MODIFY. */
1297
bc85bfee
LP
1298 __sync_synchronize();
1299
50f20cfd 1300 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
56f64d95 1301 log_error_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1302}
1303
1f2da9ec
LP
1304static int entry_item_cmp(const void *_a, const void *_b) {
1305 const EntryItem *a = _a, *b = _b;
1306
1307 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1308 return -1;
1309 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1310 return 1;
1311 return 0;
1312}
1313
de190aef 1314int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1315 unsigned i;
1316 EntryItem *items;
1317 int r;
1318 uint64_t xor_hash = 0;
de190aef 1319 struct dual_timestamp _ts;
cec736d2
LP
1320
1321 assert(f);
1322 assert(iovec || n_iovec == 0);
1323
de190aef
LP
1324 if (!ts) {
1325 dual_timestamp_get(&_ts);
1326 ts = &_ts;
1327 }
1328
1329 if (f->tail_entry_monotonic_valid &&
1330 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1331 return -EINVAL;
1332
feb12d3e 1333#ifdef HAVE_GCRYPT
7560fffc
LP
1334 r = journal_file_maybe_append_tag(f, ts->realtime);
1335 if (r < 0)
1336 return r;
feb12d3e 1337#endif
7560fffc 1338
64825d3c 1339 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1340 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1341
1342 for (i = 0; i < n_iovec; i++) {
1343 uint64_t p;
1344 Object *o;
1345
1346 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1347 if (r < 0)
cf244689 1348 return r;
cec736d2
LP
1349
1350 xor_hash ^= le64toh(o->data.hash);
1351 items[i].object_offset = htole64(p);
de7b95cd 1352 items[i].hash = o->data.hash;
cec736d2
LP
1353 }
1354
1f2da9ec
LP
1355 /* Order by the position on disk, in order to improve seek
1356 * times for rotating media. */
7ff7394d 1357 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1358
de190aef 1359 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1360
50f20cfd
LP
1361 journal_file_post_change(f);
1362
cec736d2
LP
1363 return r;
1364}
1365
/* One cached position inside a chain of entry array objects. Items are kept
 * in a hashmap keyed by the 'first' field. */
typedef struct ChainCacheItem {
        /* Offset of the array at the head of the chain (the hashmap key) */
        uint64_t first;
        /* Offset of the cached array */
        uint64_t array;
        /* The first item stored in the cached array */
        uint64_t begin;
        /* Number of items in all arrays of the chain before the cached one */
        uint64_t total;
        /* The last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1373
1374static void chain_cache_put(
4743015d 1375 OrderedHashmap *h,
a4bcff5b
LP
1376 ChainCacheItem *ci,
1377 uint64_t first,
1378 uint64_t array,
1379 uint64_t begin,
f268980d
LP
1380 uint64_t total,
1381 uint64_t last_index) {
a4bcff5b
LP
1382
1383 if (!ci) {
34741aa3
LP
1384 /* If the chain item to cache for this chain is the
1385 * first one it's not worth caching anything */
1386 if (array == first)
1387 return;
1388
29433089 1389 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 1390 ci = ordered_hashmap_steal_first(h);
29433089
LP
1391 assert(ci);
1392 } else {
a4bcff5b
LP
1393 ci = new(ChainCacheItem, 1);
1394 if (!ci)
1395 return;
1396 }
1397
1398 ci->first = first;
1399
4743015d 1400 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
1401 free(ci);
1402 return;
1403 }
1404 } else
1405 assert(ci->first == first);
1406
1407 ci->array = array;
1408 ci->begin = begin;
1409 ci->total = total;
f268980d 1410 ci->last_index = last_index;
a4bcff5b
LP
1411}
1412
f268980d
LP
1413static int generic_array_get(
1414 JournalFile *f,
1415 uint64_t first,
1416 uint64_t i,
1417 Object **ret, uint64_t *offset) {
de190aef 1418
cec736d2 1419 Object *o;
a4bcff5b 1420 uint64_t p = 0, a, t = 0;
cec736d2 1421 int r;
a4bcff5b 1422 ChainCacheItem *ci;
cec736d2
LP
1423
1424 assert(f);
1425
de190aef 1426 a = first;
a4bcff5b
LP
1427
1428 /* Try the chain cache first */
4743015d 1429 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1430 if (ci && i > ci->total) {
1431 a = ci->array;
1432 i -= ci->total;
1433 t = ci->total;
1434 }
1435
de190aef 1436 while (a > 0) {
a4bcff5b 1437 uint64_t k;
cec736d2 1438
de190aef
LP
1439 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1440 if (r < 0)
1441 return r;
cec736d2 1442
a4bcff5b
LP
1443 k = journal_file_entry_array_n_items(o);
1444 if (i < k) {
de190aef 1445 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1446 goto found;
cec736d2
LP
1447 }
1448
a4bcff5b
LP
1449 i -= k;
1450 t += k;
de190aef
LP
1451 a = le64toh(o->entry_array.next_entry_array_offset);
1452 }
1453
a4bcff5b
LP
1454 return 0;
1455
1456found:
1457 /* Let's cache this item for the next invocation */
af13a6b0 1458 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1459
1460 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1461 if (r < 0)
1462 return r;
1463
1464 if (ret)
1465 *ret = o;
1466
1467 if (offset)
1468 *offset = p;
1469
1470 return 1;
1471}
1472
f268980d
LP
1473static int generic_array_get_plus_one(
1474 JournalFile *f,
1475 uint64_t extra,
1476 uint64_t first,
1477 uint64_t i,
1478 Object **ret, uint64_t *offset) {
de190aef
LP
1479
1480 Object *o;
1481
1482 assert(f);
1483
1484 if (i == 0) {
1485 int r;
1486
1487 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1488 if (r < 0)
1489 return r;
1490
de190aef
LP
1491 if (ret)
1492 *ret = o;
cec736d2 1493
de190aef
LP
1494 if (offset)
1495 *offset = extra;
cec736d2 1496
de190aef 1497 return 1;
cec736d2
LP
1498 }
1499
de190aef
LP
1500 return generic_array_get(f, first, i-1, ret, offset);
1501}
cec736d2 1502
de190aef
LP
/* Results of the test_object_*() callbacks used for bisection */
enum {
        TEST_FOUND = 0, /* the tested object equals the needle */
        TEST_LEFT  = 1, /* the tested object is smaller than the needle */
        TEST_RIGHT = 2  /* the tested object is larger than the needle */
};
cec736d2 1508
f268980d
LP
1509static int generic_array_bisect(
1510 JournalFile *f,
1511 uint64_t first,
1512 uint64_t n,
1513 uint64_t needle,
1514 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1515 direction_t direction,
1516 Object **ret,
1517 uint64_t *offset,
1518 uint64_t *idx) {
1519
1520 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1521 bool subtract_one = false;
1522 Object *o, *array = NULL;
1523 int r;
a4bcff5b 1524 ChainCacheItem *ci;
cec736d2 1525
de190aef
LP
1526 assert(f);
1527 assert(test_object);
cec736d2 1528
a4bcff5b 1529 /* Start with the first array in the chain */
de190aef 1530 a = first;
a4bcff5b 1531
4743015d 1532 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1533 if (ci && n > ci->total) {
1534 /* Ah, we have iterated this bisection array chain
1535 * previously! Let's see if we can skip ahead in the
1536 * chain, as far as the last time. But we can't jump
1537 * backwards in the chain, so let's check that
1538 * first. */
1539
1540 r = test_object(f, ci->begin, needle);
1541 if (r < 0)
1542 return r;
1543
1544 if (r == TEST_LEFT) {
f268980d 1545 /* OK, what we are looking for is right of the
a4bcff5b
LP
1546 * begin of this EntryArray, so let's jump
1547 * straight to previously cached array in the
1548 * chain */
1549
1550 a = ci->array;
1551 n -= ci->total;
1552 t = ci->total;
f268980d 1553 last_index = ci->last_index;
a4bcff5b
LP
1554 }
1555 }
1556
de190aef
LP
1557 while (a > 0) {
1558 uint64_t left, right, k, lp;
1559
1560 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1561 if (r < 0)
1562 return r;
1563
de190aef
LP
1564 k = journal_file_entry_array_n_items(array);
1565 right = MIN(k, n);
1566 if (right <= 0)
1567 return 0;
cec736d2 1568
de190aef
LP
1569 i = right - 1;
1570 lp = p = le64toh(array->entry_array.items[i]);
1571 if (p <= 0)
1572 return -EBADMSG;
cec736d2 1573
de190aef
LP
1574 r = test_object(f, p, needle);
1575 if (r < 0)
1576 return r;
cec736d2 1577
de190aef
LP
1578 if (r == TEST_FOUND)
1579 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1580
1581 if (r == TEST_RIGHT) {
1582 left = 0;
1583 right -= 1;
f268980d
LP
1584
1585 if (last_index != (uint64_t) -1) {
1586 assert(last_index <= right);
1587
1588 /* If we cached the last index we
1589 * looked at, let's try to not to jump
1590 * too wildly around and see if we can
1591 * limit the range to look at early to
1592 * the immediate neighbors of the last
1593 * index we looked at. */
1594
1595 if (last_index > 0) {
1596 uint64_t x = last_index - 1;
1597
1598 p = le64toh(array->entry_array.items[x]);
1599 if (p <= 0)
1600 return -EBADMSG;
1601
1602 r = test_object(f, p, needle);
1603 if (r < 0)
1604 return r;
1605
1606 if (r == TEST_FOUND)
1607 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1608
1609 if (r == TEST_RIGHT)
1610 right = x;
1611 else
1612 left = x + 1;
1613 }
1614
1615 if (last_index < right) {
1616 uint64_t y = last_index + 1;
1617
1618 p = le64toh(array->entry_array.items[y]);
1619 if (p <= 0)
1620 return -EBADMSG;
1621
1622 r = test_object(f, p, needle);
1623 if (r < 0)
1624 return r;
1625
1626 if (r == TEST_FOUND)
1627 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1628
1629 if (r == TEST_RIGHT)
1630 right = y;
1631 else
1632 left = y + 1;
1633 }
f268980d
LP
1634 }
1635
de190aef
LP
1636 for (;;) {
1637 if (left == right) {
1638 if (direction == DIRECTION_UP)
1639 subtract_one = true;
1640
1641 i = left;
1642 goto found;
1643 }
1644
1645 assert(left < right);
de190aef 1646 i = (left + right) / 2;
f268980d 1647
de190aef
LP
1648 p = le64toh(array->entry_array.items[i]);
1649 if (p <= 0)
1650 return -EBADMSG;
1651
1652 r = test_object(f, p, needle);
1653 if (r < 0)
1654 return r;
cec736d2 1655
de190aef
LP
1656 if (r == TEST_FOUND)
1657 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1658
1659 if (r == TEST_RIGHT)
1660 right = i;
1661 else
1662 left = i + 1;
1663 }
1664 }
1665
2173cbf8 1666 if (k >= n) {
cbdca852
LP
1667 if (direction == DIRECTION_UP) {
1668 i = n;
1669 subtract_one = true;
1670 goto found;
1671 }
1672
cec736d2 1673 return 0;
cbdca852 1674 }
cec736d2 1675
de190aef
LP
1676 last_p = lp;
1677
1678 n -= k;
1679 t += k;
f268980d 1680 last_index = (uint64_t) -1;
de190aef 1681 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1682 }
1683
1684 return 0;
de190aef
LP
1685
1686found:
1687 if (subtract_one && t == 0 && i == 0)
1688 return 0;
1689
a4bcff5b 1690 /* Let's cache this item for the next invocation */
af13a6b0 1691 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 1692
de190aef
LP
1693 if (subtract_one && i == 0)
1694 p = last_p;
1695 else if (subtract_one)
1696 p = le64toh(array->entry_array.items[i-1]);
1697 else
1698 p = le64toh(array->entry_array.items[i]);
1699
1700 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1701 if (r < 0)
1702 return r;
1703
1704 if (ret)
1705 *ret = o;
1706
1707 if (offset)
1708 *offset = p;
1709
1710 if (idx)
cbdca852 1711 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1712
1713 return 1;
cec736d2
LP
1714}
1715
f268980d
LP
1716
1717static int generic_array_bisect_plus_one(
1718 JournalFile *f,
1719 uint64_t extra,
1720 uint64_t first,
1721 uint64_t n,
1722 uint64_t needle,
1723 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1724 direction_t direction,
1725 Object **ret,
1726 uint64_t *offset,
1727 uint64_t *idx) {
de190aef 1728
cec736d2 1729 int r;
cbdca852
LP
1730 bool step_back = false;
1731 Object *o;
cec736d2
LP
1732
1733 assert(f);
de190aef 1734 assert(test_object);
cec736d2 1735
de190aef
LP
1736 if (n <= 0)
1737 return 0;
cec736d2 1738
de190aef
LP
1739 /* This bisects the array in object 'first', but first checks
1740 * an extra */
de190aef
LP
1741 r = test_object(f, extra, needle);
1742 if (r < 0)
1743 return r;
a536e261
LP
1744
1745 if (r == TEST_FOUND)
1746 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1747
cbdca852
LP
1748 /* if we are looking with DIRECTION_UP then we need to first
1749 see if in the actual array there is a matching entry, and
1750 return the last one of that. But if there isn't any we need
1751 to return this one. Hence remember this, and return it
1752 below. */
1753 if (r == TEST_LEFT)
1754 step_back = direction == DIRECTION_UP;
de190aef 1755
cbdca852
LP
1756 if (r == TEST_RIGHT) {
1757 if (direction == DIRECTION_DOWN)
1758 goto found;
1759 else
1760 return 0;
a536e261 1761 }
cec736d2 1762
de190aef
LP
1763 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1764
cbdca852
LP
1765 if (r == 0 && step_back)
1766 goto found;
1767
ecf68b1d 1768 if (r > 0 && idx)
de190aef
LP
1769 (*idx) ++;
1770
1771 return r;
cbdca852
LP
1772
1773found:
1774 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1775 if (r < 0)
1776 return r;
1777
1778 if (ret)
1779 *ret = o;
1780
1781 if (offset)
1782 *offset = extra;
1783
1784 if (idx)
1785 *idx = 0;
1786
1787 return 1;
1788}
1789
44a6b1b6 1790_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1791 assert(f);
1792 assert(p > 0);
1793
1794 if (p == needle)
1795 return TEST_FOUND;
1796 else if (p < needle)
1797 return TEST_LEFT;
1798 else
1799 return TEST_RIGHT;
1800}
1801
de190aef
LP
1802static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1803 Object *o;
1804 int r;
1805
1806 assert(f);
1807 assert(p > 0);
1808
1809 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1810 if (r < 0)
1811 return r;
1812
de190aef
LP
1813 if (le64toh(o->entry.seqnum) == needle)
1814 return TEST_FOUND;
1815 else if (le64toh(o->entry.seqnum) < needle)
1816 return TEST_LEFT;
1817 else
1818 return TEST_RIGHT;
1819}
cec736d2 1820
de190aef
LP
1821int journal_file_move_to_entry_by_seqnum(
1822 JournalFile *f,
1823 uint64_t seqnum,
1824 direction_t direction,
1825 Object **ret,
1826 uint64_t *offset) {
1827
1828 return generic_array_bisect(f,
1829 le64toh(f->header->entry_array_offset),
1830 le64toh(f->header->n_entries),
1831 seqnum,
1832 test_object_seqnum,
1833 direction,
1834 ret, offset, NULL);
1835}
cec736d2 1836
de190aef
LP
1837static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1838 Object *o;
1839 int r;
1840
1841 assert(f);
1842 assert(p > 0);
1843
1844 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1845 if (r < 0)
1846 return r;
1847
1848 if (le64toh(o->entry.realtime) == needle)
1849 return TEST_FOUND;
1850 else if (le64toh(o->entry.realtime) < needle)
1851 return TEST_LEFT;
1852 else
1853 return TEST_RIGHT;
cec736d2
LP
1854}
1855
de190aef
LP
1856int journal_file_move_to_entry_by_realtime(
1857 JournalFile *f,
1858 uint64_t realtime,
1859 direction_t direction,
1860 Object **ret,
1861 uint64_t *offset) {
1862
1863 return generic_array_bisect(f,
1864 le64toh(f->header->entry_array_offset),
1865 le64toh(f->header->n_entries),
1866 realtime,
1867 test_object_realtime,
1868 direction,
1869 ret, offset, NULL);
1870}
1871
1872static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1873 Object *o;
1874 int r;
1875
1876 assert(f);
1877 assert(p > 0);
1878
1879 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1880 if (r < 0)
1881 return r;
1882
1883 if (le64toh(o->entry.monotonic) == needle)
1884 return TEST_FOUND;
1885 else if (le64toh(o->entry.monotonic) < needle)
1886 return TEST_LEFT;
1887 else
1888 return TEST_RIGHT;
1889}
1890
47838ab3
ZJS
1891static inline int find_data_object_by_boot_id(
1892 JournalFile *f,
1893 sd_id128_t boot_id,
1894 Object **o,
1895 uint64_t *b) {
1896 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1897
1898 sd_id128_to_string(boot_id, t + 9);
1899 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1900}
1901
de190aef
LP
1902int journal_file_move_to_entry_by_monotonic(
1903 JournalFile *f,
1904 sd_id128_t boot_id,
1905 uint64_t monotonic,
1906 direction_t direction,
1907 Object **ret,
1908 uint64_t *offset) {
1909
de190aef
LP
1910 Object *o;
1911 int r;
1912
cbdca852 1913 assert(f);
de190aef 1914
47838ab3 1915 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
1916 if (r < 0)
1917 return r;
cbdca852 1918 if (r == 0)
de190aef
LP
1919 return -ENOENT;
1920
1921 return generic_array_bisect_plus_one(f,
1922 le64toh(o->data.entry_offset),
1923 le64toh(o->data.entry_array_offset),
1924 le64toh(o->data.n_entries),
1925 monotonic,
1926 test_object_monotonic,
1927 direction,
1928 ret, offset, NULL);
1929}
1930
1fc605b0
MS
1931void journal_file_reset_location(JournalFile *f) {
1932 f->current_offset = 0;
1933}
1934
de190aef
LP
1935int journal_file_next_entry(
1936 JournalFile *f,
1937 Object *o, uint64_t p,
1938 direction_t direction,
1939 Object **ret, uint64_t *offset) {
1940
fb099c8d 1941 uint64_t i, n, ofs;
cec736d2
LP
1942 int r;
1943
1944 assert(f);
de190aef
LP
1945 assert(p > 0 || !o);
1946
1947 n = le64toh(f->header->n_entries);
1948 if (n <= 0)
1949 return 0;
cec736d2
LP
1950
1951 if (!o)
de190aef 1952 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 1953 else {
de190aef 1954 if (o->object.type != OBJECT_ENTRY)
cec736d2
LP
1955 return -EINVAL;
1956
de190aef
LP
1957 r = generic_array_bisect(f,
1958 le64toh(f->header->entry_array_offset),
1959 le64toh(f->header->n_entries),
1960 p,
1961 test_object_offset,
1962 DIRECTION_DOWN,
1963 NULL, NULL,
1964 &i);
1965 if (r <= 0)
1966 return r;
1967
1968 if (direction == DIRECTION_DOWN) {
1969 if (i >= n - 1)
1970 return 0;
1971
1972 i++;
1973 } else {
1974 if (i <= 0)
1975 return 0;
1976
1977 i--;
1978 }
cec736d2
LP
1979 }
1980
de190aef 1981 /* And jump to it */
fb099c8d
ZJS
1982 r = generic_array_get(f,
1983 le64toh(f->header->entry_array_offset),
1984 i,
1985 ret, &ofs);
1986 if (r <= 0)
1987 return r;
1988
1989 if (p > 0 &&
1990 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
1991 log_debug("%s: entry array corrupted at entry %"PRIu64,
1992 f->path, i);
1993 return -EBADMSG;
1994 }
1995
1996 if (offset)
1997 *offset = ofs;
1998
1999 return 1;
de190aef 2000}
cec736d2 2001
de190aef
LP
2002int journal_file_next_entry_for_data(
2003 JournalFile *f,
2004 Object *o, uint64_t p,
2005 uint64_t data_offset,
2006 direction_t direction,
2007 Object **ret, uint64_t *offset) {
2008
2009 uint64_t n, i;
cec736d2 2010 int r;
de190aef 2011 Object *d;
cec736d2
LP
2012
2013 assert(f);
de190aef 2014 assert(p > 0 || !o);
cec736d2 2015
de190aef 2016 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2017 if (r < 0)
de190aef 2018 return r;
cec736d2 2019
de190aef
LP
2020 n = le64toh(d->data.n_entries);
2021 if (n <= 0)
2022 return n;
cec736d2 2023
de190aef
LP
2024 if (!o)
2025 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2026 else {
2027 if (o->object.type != OBJECT_ENTRY)
2028 return -EINVAL;
cec736d2 2029
de190aef
LP
2030 r = generic_array_bisect_plus_one(f,
2031 le64toh(d->data.entry_offset),
2032 le64toh(d->data.entry_array_offset),
2033 le64toh(d->data.n_entries),
2034 p,
2035 test_object_offset,
2036 DIRECTION_DOWN,
2037 NULL, NULL,
2038 &i);
2039
2040 if (r <= 0)
cec736d2
LP
2041 return r;
2042
de190aef
LP
2043 if (direction == DIRECTION_DOWN) {
2044 if (i >= n - 1)
2045 return 0;
cec736d2 2046
de190aef
LP
2047 i++;
2048 } else {
2049 if (i <= 0)
2050 return 0;
cec736d2 2051
de190aef
LP
2052 i--;
2053 }
cec736d2 2054
de190aef 2055 }
cec736d2 2056
de190aef
LP
2057 return generic_array_get_plus_one(f,
2058 le64toh(d->data.entry_offset),
2059 le64toh(d->data.entry_array_offset),
2060 i,
2061 ret, offset);
2062}
cec736d2 2063
cbdca852
LP
2064int journal_file_move_to_entry_by_offset_for_data(
2065 JournalFile *f,
2066 uint64_t data_offset,
2067 uint64_t p,
2068 direction_t direction,
2069 Object **ret, uint64_t *offset) {
2070
2071 int r;
2072 Object *d;
2073
2074 assert(f);
2075
2076 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2077 if (r < 0)
2078 return r;
2079
2080 return generic_array_bisect_plus_one(f,
2081 le64toh(d->data.entry_offset),
2082 le64toh(d->data.entry_array_offset),
2083 le64toh(d->data.n_entries),
2084 p,
2085 test_object_offset,
2086 direction,
2087 ret, offset, NULL);
2088}
2089
2090int journal_file_move_to_entry_by_monotonic_for_data(
2091 JournalFile *f,
2092 uint64_t data_offset,
2093 sd_id128_t boot_id,
2094 uint64_t monotonic,
2095 direction_t direction,
2096 Object **ret, uint64_t *offset) {
2097
cbdca852
LP
2098 Object *o, *d;
2099 int r;
2100 uint64_t b, z;
2101
2102 assert(f);
2103
2104 /* First, seek by time */
47838ab3 2105 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2106 if (r < 0)
2107 return r;
2108 if (r == 0)
2109 return -ENOENT;
2110
2111 r = generic_array_bisect_plus_one(f,
2112 le64toh(o->data.entry_offset),
2113 le64toh(o->data.entry_array_offset),
2114 le64toh(o->data.n_entries),
2115 monotonic,
2116 test_object_monotonic,
2117 direction,
2118 NULL, &z, NULL);
2119 if (r <= 0)
2120 return r;
2121
2122 /* And now, continue seeking until we find an entry that
2123 * exists in both bisection arrays */
2124
2125 for (;;) {
2126 Object *qo;
2127 uint64_t p, q;
2128
2129 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2130 if (r < 0)
2131 return r;
2132
2133 r = generic_array_bisect_plus_one(f,
2134 le64toh(d->data.entry_offset),
2135 le64toh(d->data.entry_array_offset),
2136 le64toh(d->data.n_entries),
2137 z,
2138 test_object_offset,
2139 direction,
2140 NULL, &p, NULL);
2141 if (r <= 0)
2142 return r;
2143
2144 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2145 if (r < 0)
2146 return r;
2147
2148 r = generic_array_bisect_plus_one(f,
2149 le64toh(o->data.entry_offset),
2150 le64toh(o->data.entry_array_offset),
2151 le64toh(o->data.n_entries),
2152 p,
2153 test_object_offset,
2154 direction,
2155 &qo, &q, NULL);
2156
2157 if (r <= 0)
2158 return r;
2159
2160 if (p == q) {
2161 if (ret)
2162 *ret = qo;
2163 if (offset)
2164 *offset = q;
2165
2166 return 1;
2167 }
2168
2169 z = q;
2170 }
cbdca852
LP
2171}
2172
de190aef
LP
2173int journal_file_move_to_entry_by_seqnum_for_data(
2174 JournalFile *f,
2175 uint64_t data_offset,
2176 uint64_t seqnum,
2177 direction_t direction,
2178 Object **ret, uint64_t *offset) {
cec736d2 2179
de190aef
LP
2180 Object *d;
2181 int r;
cec736d2 2182
91a31dde
LP
2183 assert(f);
2184
de190aef 2185 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2186 if (r < 0)
de190aef 2187 return r;
cec736d2 2188
de190aef
LP
2189 return generic_array_bisect_plus_one(f,
2190 le64toh(d->data.entry_offset),
2191 le64toh(d->data.entry_array_offset),
2192 le64toh(d->data.n_entries),
2193 seqnum,
2194 test_object_seqnum,
2195 direction,
2196 ret, offset, NULL);
2197}
cec736d2 2198
de190aef
LP
2199int journal_file_move_to_entry_by_realtime_for_data(
2200 JournalFile *f,
2201 uint64_t data_offset,
2202 uint64_t realtime,
2203 direction_t direction,
2204 Object **ret, uint64_t *offset) {
2205
2206 Object *d;
2207 int r;
2208
91a31dde
LP
2209 assert(f);
2210
de190aef 2211 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2212 if (r < 0)
de190aef
LP
2213 return r;
2214
2215 return generic_array_bisect_plus_one(f,
2216 le64toh(d->data.entry_offset),
2217 le64toh(d->data.entry_array_offset),
2218 le64toh(d->data.n_entries),
2219 realtime,
2220 test_object_realtime,
2221 direction,
2222 ret, offset, NULL);
cec736d2
LP
2223}
2224
0284adc6 2225void journal_file_dump(JournalFile *f) {
7560fffc 2226 Object *o;
7560fffc 2227 int r;
0284adc6 2228 uint64_t p;
7560fffc
LP
2229
2230 assert(f);
2231
0284adc6 2232 journal_file_print_header(f);
7560fffc 2233
0284adc6
LP
2234 p = le64toh(f->header->header_size);
2235 while (p != 0) {
d05089d8 2236 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
2237 if (r < 0)
2238 goto fail;
7560fffc 2239
0284adc6 2240 switch (o->object.type) {
d98cc1f2 2241
0284adc6
LP
2242 case OBJECT_UNUSED:
2243 printf("Type: OBJECT_UNUSED\n");
2244 break;
d98cc1f2 2245
0284adc6
LP
2246 case OBJECT_DATA:
2247 printf("Type: OBJECT_DATA\n");
2248 break;
7560fffc 2249
3c1668da
LP
2250 case OBJECT_FIELD:
2251 printf("Type: OBJECT_FIELD\n");
2252 break;
2253
0284adc6 2254 case OBJECT_ENTRY:
507f22bd
ZJS
2255 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2256 le64toh(o->entry.seqnum),
2257 le64toh(o->entry.monotonic),
2258 le64toh(o->entry.realtime));
0284adc6 2259 break;
7560fffc 2260
0284adc6
LP
2261 case OBJECT_FIELD_HASH_TABLE:
2262 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2263 break;
7560fffc 2264
0284adc6
LP
2265 case OBJECT_DATA_HASH_TABLE:
2266 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2267 break;
7560fffc 2268
0284adc6
LP
2269 case OBJECT_ENTRY_ARRAY:
2270 printf("Type: OBJECT_ENTRY_ARRAY\n");
2271 break;
7560fffc 2272
0284adc6 2273 case OBJECT_TAG:
507f22bd
ZJS
2274 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2275 le64toh(o->tag.seqnum),
2276 le64toh(o->tag.epoch));
0284adc6 2277 break;
3c1668da
LP
2278
2279 default:
2280 printf("Type: unknown (%u)\n", o->object.type);
2281 break;
0284adc6 2282 }
7560fffc 2283
d89c8fdf
ZJS
2284 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2285 printf("Flags: %s\n",
2286 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 2287
0284adc6
LP
2288 if (p == le64toh(f->header->tail_object_offset))
2289 p = 0;
2290 else
2291 p = p + ALIGN64(le64toh(o->object.size));
2292 }
7560fffc 2293
0284adc6
LP
2294 return;
2295fail:
2296 log_error("File corrupt");
7560fffc
LP
2297}
2298
718fe4b1
ZJS
2299static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2300 const char *x;
2301
2302 x = format_timestamp(buf, l, t);
2303 if (x)
2304 return x;
2305 return " --- ";
2306}
2307
0284adc6 2308void journal_file_print_header(JournalFile *f) {
2765b7bb 2309 char a[33], b[33], c[33], d[33];
ed375beb 2310 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2311 struct stat st;
2312 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2313
2314 assert(f);
7560fffc 2315
0284adc6
LP
2316 printf("File Path: %s\n"
2317 "File ID: %s\n"
2318 "Machine ID: %s\n"
2319 "Boot ID: %s\n"
2320 "Sequential Number ID: %s\n"
2321 "State: %s\n"
2322 "Compatible Flags:%s%s\n"
d89c8fdf 2323 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
2324 "Header size: %"PRIu64"\n"
2325 "Arena size: %"PRIu64"\n"
2326 "Data Hash Table Size: %"PRIu64"\n"
2327 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2328 "Rotate Suggested: %s\n"
507f22bd
ZJS
2329 "Head Sequential Number: %"PRIu64"\n"
2330 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2331 "Head Realtime Timestamp: %s\n"
3223f44f 2332 "Tail Realtime Timestamp: %s\n"
ed375beb 2333 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2334 "Objects: %"PRIu64"\n"
2335 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2336 f->path,
2337 sd_id128_to_string(f->header->file_id, a),
2338 sd_id128_to_string(f->header->machine_id, b),
2339 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2340 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2341 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2342 f->header->state == STATE_ONLINE ? "ONLINE" :
2343 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 2344 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
2345 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2346 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2347 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2348 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
2349 le64toh(f->header->header_size),
2350 le64toh(f->header->arena_size),
2351 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2352 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2353 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2354 le64toh(f->header->head_entry_seqnum),
2355 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2356 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2357 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2358 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2359 le64toh(f->header->n_objects),
2360 le64toh(f->header->n_entries));
7560fffc 2361
0284adc6 2362 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2363 printf("Data Objects: %"PRIu64"\n"
0284adc6 2364 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2365 le64toh(f->header->n_data),
0284adc6 2366 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2367
0284adc6 2368 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2369 printf("Field Objects: %"PRIu64"\n"
0284adc6 2370 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2371 le64toh(f->header->n_fields),
0284adc6 2372 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2373
2374 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2375 printf("Tag Objects: %"PRIu64"\n",
2376 le64toh(f->header->n_tags));
3223f44f 2377 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2378 printf("Entry Array Objects: %"PRIu64"\n",
2379 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2380
2381 if (fstat(f->fd, &st) >= 0)
2382 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
7560fffc
LP
2383}
2384
0284adc6
LP
2385int journal_file_open(
2386 const char *fname,
2387 int flags,
2388 mode_t mode,
2389 bool compress,
baed47c3 2390 bool seal,
0284adc6
LP
2391 JournalMetrics *metrics,
2392 MMapCache *mmap_cache,
2393 JournalFile *template,
2394 JournalFile **ret) {
7560fffc 2395
0284adc6
LP
2396 JournalFile *f;
2397 int r;
2398 bool newly_created = false;
7560fffc 2399
0284adc6 2400 assert(fname);
0559d3a5 2401 assert(ret);
7560fffc 2402
0284adc6
LP
2403 if ((flags & O_ACCMODE) != O_RDONLY &&
2404 (flags & O_ACCMODE) != O_RDWR)
2405 return -EINVAL;
7560fffc 2406
a0108012
LP
2407 if (!endswith(fname, ".journal") &&
2408 !endswith(fname, ".journal~"))
0284adc6 2409 return -EINVAL;
7560fffc 2410
0284adc6
LP
2411 f = new0(JournalFile, 1);
2412 if (!f)
2413 return -ENOMEM;
7560fffc 2414
0284adc6
LP
2415 f->fd = -1;
2416 f->mode = mode;
7560fffc 2417
0284adc6
LP
2418 f->flags = flags;
2419 f->prot = prot_from_flags(flags);
2420 f->writable = (flags & O_ACCMODE) != O_RDONLY;
92261977 2421#if defined(HAVE_LZ4)
d89c8fdf
ZJS
2422 f->compress_lz4 = compress;
2423#elif defined(HAVE_XZ)
2424 f->compress_xz = compress;
48b61739 2425#endif
49a32d43 2426#ifdef HAVE_GCRYPT
baed47c3 2427 f->seal = seal;
49a32d43 2428#endif
7560fffc 2429
0284adc6
LP
2430 if (mmap_cache)
2431 f->mmap = mmap_cache_ref(mmap_cache);
2432 else {
84168d80 2433 f->mmap = mmap_cache_new();
0284adc6
LP
2434 if (!f->mmap) {
2435 r = -ENOMEM;
2436 goto fail;
2437 }
2438 }
7560fffc 2439
0284adc6
LP
2440 f->path = strdup(fname);
2441 if (!f->path) {
2442 r = -ENOMEM;
2443 goto fail;
2444 }
7560fffc 2445
4743015d 2446 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
2447 if (!f->chain_cache) {
2448 r = -ENOMEM;
2449 goto fail;
2450 }
2451
0284adc6
LP
2452 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2453 if (f->fd < 0) {
2454 r = -errno;
2455 goto fail;
7560fffc 2456 }
7560fffc 2457
0284adc6
LP
2458 if (fstat(f->fd, &f->last_stat) < 0) {
2459 r = -errno;
2460 goto fail;
2461 }
7560fffc 2462
0284adc6 2463 if (f->last_stat.st_size == 0 && f->writable) {
fb0951b0
LP
2464 uint64_t crtime;
2465
2466 /* Let's attach the creation time to the journal file,
2467 * so that the vacuuming code knows the age of this
2468 * file even if the file might end up corrupted one
2469 * day... Ideally we'd just use the creation time many
2470 * file systems maintain for each file, but there is
2471 * currently no usable API to query this, hence let's
2472 * emulate this via extended attributes. If extended
2473 * attributes are not supported we'll just skip this,
7517e174 2474 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0
LP
2475
2476 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2477 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
7560fffc 2478
feb12d3e 2479#ifdef HAVE_GCRYPT
0284adc6 2480 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2481 * just don't do sealing */
49a32d43
LP
2482 if (f->seal) {
2483 r = journal_file_fss_load(f);
2484 if (r < 0)
2485 f->seal = false;
2486 }
feb12d3e 2487#endif
7560fffc 2488
0284adc6
LP
2489 r = journal_file_init_header(f, template);
2490 if (r < 0)
2491 goto fail;
7560fffc 2492
0284adc6
LP
2493 if (fstat(f->fd, &f->last_stat) < 0) {
2494 r = -errno;
2495 goto fail;
2496 }
fb0951b0
LP
2497
2498 newly_created = true;
0284adc6 2499 }
7560fffc 2500
0284adc6
LP
2501 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2502 r = -EIO;
2503 goto fail;
2504 }
7560fffc 2505
0284adc6
LP
2506 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2507 if (f->header == MAP_FAILED) {
2508 f->header = NULL;
2509 r = -errno;
2510 goto fail;
2511 }
7560fffc 2512
0284adc6
LP
2513 if (!newly_created) {
2514 r = journal_file_verify_header(f);
2515 if (r < 0)
2516 goto fail;
2517 }
7560fffc 2518
feb12d3e 2519#ifdef HAVE_GCRYPT
0284adc6 2520 if (!newly_created && f->writable) {
baed47c3 2521 r = journal_file_fss_load(f);
0284adc6
LP
2522 if (r < 0)
2523 goto fail;
2524 }
feb12d3e 2525#endif
cec736d2
LP
2526
2527 if (f->writable) {
4a92baf3
LP
2528 if (metrics) {
2529 journal_default_metrics(metrics, f->fd);
2530 f->metrics = *metrics;
2531 } else if (template)
2532 f->metrics = template->metrics;
2533
cec736d2
LP
2534 r = journal_file_refresh_header(f);
2535 if (r < 0)
2536 goto fail;
2537 }
2538
feb12d3e 2539#ifdef HAVE_GCRYPT
baed47c3 2540 r = journal_file_hmac_setup(f);
14d10188
LP
2541 if (r < 0)
2542 goto fail;
feb12d3e 2543#endif
14d10188 2544
cec736d2 2545 if (newly_created) {
de190aef 2546 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2547 if (r < 0)
2548 goto fail;
2549
de190aef 2550 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2551 if (r < 0)
2552 goto fail;
7560fffc 2553
feb12d3e 2554#ifdef HAVE_GCRYPT
7560fffc
LP
2555 r = journal_file_append_first_tag(f);
2556 if (r < 0)
2557 goto fail;
feb12d3e 2558#endif
cec736d2
LP
2559 }
2560
de190aef 2561 r = journal_file_map_field_hash_table(f);
cec736d2
LP
2562 if (r < 0)
2563 goto fail;
2564
de190aef 2565 r = journal_file_map_data_hash_table(f);
cec736d2
LP
2566 if (r < 0)
2567 goto fail;
2568
0559d3a5 2569 *ret = f;
cec736d2
LP
2570 return 0;
2571
2572fail:
2573 journal_file_close(f);
2574
2575 return r;
2576}
0ac38b70 2577
baed47c3 2578int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2579 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2580 size_t l;
2581 JournalFile *old_file, *new_file = NULL;
2582 int r;
2583
2584 assert(f);
2585 assert(*f);
2586
2587 old_file = *f;
2588
2589 if (!old_file->writable)
2590 return -EINVAL;
2591
2592 if (!endswith(old_file->path, ".journal"))
2593 return -EINVAL;
2594
2595 l = strlen(old_file->path);
57535f47
ZJS
2596 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2597 (int) l - 8, old_file->path,
2598 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2599 le64toh((*f)->header->head_entry_seqnum),
2600 le64toh((*f)->header->head_entry_realtime));
2601 if (r < 0)
0ac38b70
LP
2602 return -ENOMEM;
2603
0ac38b70 2604 r = rename(old_file->path, p);
0ac38b70
LP
2605 if (r < 0)
2606 return -errno;
2607
ccdbaf91 2608 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2609
baed47c3 2610 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2611 journal_file_close(old_file);
2612
2613 *f = new_file;
2614 return r;
2615}
2616
9447a7f1
LP
2617int journal_file_open_reliably(
2618 const char *fname,
2619 int flags,
2620 mode_t mode,
7560fffc 2621 bool compress,
baed47c3 2622 bool seal,
4a92baf3 2623 JournalMetrics *metrics,
27370278 2624 MMapCache *mmap_cache,
9447a7f1
LP
2625 JournalFile *template,
2626 JournalFile **ret) {
2627
2628 int r;
2629 size_t l;
ed375beb 2630 _cleanup_free_ char *p = NULL;
9447a7f1 2631
baed47c3 2632 r = journal_file_open(fname, flags, mode, compress, seal,
27370278 2633 metrics, mmap_cache, template, ret);
0071d9f1
LP
2634 if (r != -EBADMSG && /* corrupted */
2635 r != -ENODATA && /* truncated */
2636 r != -EHOSTDOWN && /* other machine */
a1a1898f
LP
2637 r != -EPROTONOSUPPORT && /* incompatible feature */
2638 r != -EBUSY && /* unclean shutdown */
2639 r != -ESHUTDOWN /* already archived */)
9447a7f1
LP
2640 return r;
2641
2642 if ((flags & O_ACCMODE) == O_RDONLY)
2643 return r;
2644
2645 if (!(flags & O_CREAT))
2646 return r;
2647
7560fffc
LP
2648 if (!endswith(fname, ".journal"))
2649 return r;
2650
5c70eab4
LP
2651 /* The file is corrupted. Rotate it away and try it again (but only once) */
2652
9447a7f1 2653 l = strlen(fname);
9bf3b535 2654 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
57535f47 2655 (int) l - 8, fname,
9447a7f1 2656 (unsigned long long) now(CLOCK_REALTIME),
9bf3b535 2657 random_u64()) < 0)
9447a7f1
LP
2658 return -ENOMEM;
2659
2660 r = rename(fname, p);
9447a7f1
LP
2661 if (r < 0)
2662 return -errno;
2663
a1a1898f 2664 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2665
baed47c3 2666 return journal_file_open(fname, flags, mode, compress, seal,
27370278 2667 metrics, mmap_cache, template, ret);
9447a7f1
LP
2668}
2669
cf244689
LP
2670int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2671 uint64_t i, n;
2672 uint64_t q, xor_hash = 0;
2673 int r;
2674 EntryItem *items;
2675 dual_timestamp ts;
2676
2677 assert(from);
2678 assert(to);
2679 assert(o);
2680 assert(p);
2681
2682 if (!to->writable)
2683 return -EPERM;
2684
2685 ts.monotonic = le64toh(o->entry.monotonic);
2686 ts.realtime = le64toh(o->entry.realtime);
2687
cf244689 2688 n = journal_file_entry_n_items(o);
4faa7004
TA
2689 /* alloca() can't take 0, hence let's allocate at least one */
2690 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
2691
2692 for (i = 0; i < n; i++) {
4fd052ae
FC
2693 uint64_t l, h;
2694 le64_t le_hash;
cf244689
LP
2695 size_t t;
2696 void *data;
2697 Object *u;
2698
2699 q = le64toh(o->entry.items[i].object_offset);
2700 le_hash = o->entry.items[i].hash;
2701
2702 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2703 if (r < 0)
2704 return r;
2705
2706 if (le_hash != o->data.hash)
2707 return -EBADMSG;
2708
2709 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2710 t = (size_t) l;
2711
2712 /* We hit the limit on 32bit machines */
2713 if ((uint64_t) t != l)
2714 return -E2BIG;
2715
d89c8fdf 2716 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 2717#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51 2718 size_t rsize;
cf244689 2719
d89c8fdf
ZJS
2720 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2721 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2722 if (r < 0)
2723 return r;
cf244689
LP
2724
2725 data = from->compress_buffer;
2726 l = rsize;
3b1a55e1
ZJS
2727#else
2728 return -EPROTONOSUPPORT;
2729#endif
cf244689
LP
2730 } else
2731 data = o->data.payload;
2732
2733 r = journal_file_append_data(to, data, l, &u, &h);
2734 if (r < 0)
2735 return r;
2736
2737 xor_hash ^= le64toh(u->data.hash);
2738 items[i].object_offset = htole64(h);
2739 items[i].hash = u->data.hash;
2740
2741 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2742 if (r < 0)
2743 return r;
2744 }
2745
2746 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2747}
babfc091
LP
2748
2749void journal_default_metrics(JournalMetrics *m, int fd) {
2750 uint64_t fs_size = 0;
2751 struct statvfs ss;
a7bc2c2a 2752 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
babfc091
LP
2753
2754 assert(m);
2755 assert(fd >= 0);
2756
2757 if (fstatvfs(fd, &ss) >= 0)
2758 fs_size = ss.f_frsize * ss.f_blocks;
2759
2760 if (m->max_use == (uint64_t) -1) {
2761
2762 if (fs_size > 0) {
2763 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2764
2765 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2766 m->max_use = DEFAULT_MAX_USE_UPPER;
2767
2768 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2769 m->max_use = DEFAULT_MAX_USE_LOWER;
2770 } else
2771 m->max_use = DEFAULT_MAX_USE_LOWER;
2772 } else {
2773 m->max_use = PAGE_ALIGN(m->max_use);
2774
2775 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2776 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2777 }
2778
2779 if (m->max_size == (uint64_t) -1) {
2780 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2781
2782 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2783 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2784 } else
2785 m->max_size = PAGE_ALIGN(m->max_size);
2786
2787 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2788 m->max_size = JOURNAL_FILE_SIZE_MIN;
2789
2790 if (m->max_size*2 > m->max_use)
2791 m->max_use = m->max_size*2;
2792
2793 if (m->min_size == (uint64_t) -1)
2794 m->min_size = JOURNAL_FILE_SIZE_MIN;
2795 else {
2796 m->min_size = PAGE_ALIGN(m->min_size);
2797
2798 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2799 m->min_size = JOURNAL_FILE_SIZE_MIN;
2800
2801 if (m->min_size > m->max_size)
2802 m->max_size = m->min_size;
2803 }
2804
2805 if (m->keep_free == (uint64_t) -1) {
2806
2807 if (fs_size > 0) {
8621b110 2808 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
2809
2810 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2811 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2812
2813 } else
2814 m->keep_free = DEFAULT_KEEP_FREE;
2815 }
2816
2b43f939
LP
2817 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2818 format_bytes(a, sizeof(a), m->max_use),
2819 format_bytes(b, sizeof(b), m->max_size),
2820 format_bytes(c, sizeof(c), m->min_size),
2821 format_bytes(d, sizeof(d), m->keep_free));
babfc091 2822}
08984293
LP
2823
2824int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
2825 assert(f);
2826 assert(from || to);
2827
2828 if (from) {
162566a4
LP
2829 if (f->header->head_entry_realtime == 0)
2830 return -ENOENT;
08984293 2831
162566a4 2832 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
2833 }
2834
2835 if (to) {
162566a4
LP
2836 if (f->header->tail_entry_realtime == 0)
2837 return -ENOENT;
08984293 2838
162566a4 2839 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
2840 }
2841
2842 return 1;
2843}
2844
2845int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
2846 Object *o;
2847 uint64_t p;
2848 int r;
2849
2850 assert(f);
2851 assert(from || to);
2852
47838ab3 2853 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
2854 if (r <= 0)
2855 return r;
2856
2857 if (le64toh(o->data.n_entries) <= 0)
2858 return 0;
2859
2860 if (from) {
2861 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2862 if (r < 0)
2863 return r;
2864
2865 *from = le64toh(o->entry.monotonic);
2866 }
2867
2868 if (to) {
2869 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2870 if (r < 0)
2871 return r;
2872
2873 r = generic_array_get_plus_one(f,
2874 le64toh(o->data.entry_offset),
2875 le64toh(o->data.entry_array_offset),
2876 le64toh(o->data.n_entries)-1,
2877 &o, NULL);
2878 if (r <= 0)
2879 return r;
2880
2881 *to = le64toh(o->entry.monotonic);
2882 }
2883
2884 return 1;
2885}
dca6219e 2886
fb0951b0 2887bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
2888 assert(f);
2889
2890 /* If we gained new header fields we gained new features,
2891 * hence suggest a rotation */
361f9cbc
LP
2892 if (le64toh(f->header->header_size) < sizeof(Header)) {
2893 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 2894 return true;
361f9cbc 2895 }
dca6219e
LP
2896
2897 /* Let's check if the hash tables grew over a certain fill
2898 * level (75%, borrowing this value from Java's hash table
2899 * implementation), and if so suggest a rotation. To calculate
2900 * the fill level we need the n_data field, which only exists
2901 * in newer versions. */
2902
2903 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 2904 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 2905 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
2906 f->path,
2907 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
2908 le64toh(f->header->n_data),
2909 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2910 (unsigned long long) f->last_stat.st_size,
2911 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 2912 return true;
361f9cbc 2913 }
dca6219e
LP
2914
2915 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 2916 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 2917 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
2918 f->path,
2919 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
2920 le64toh(f->header->n_fields),
2921 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 2922 return true;
361f9cbc 2923 }
dca6219e 2924
0598fd4a
LP
2925 /* Are the data objects properly indexed by field objects? */
2926 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2927 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2928 le64toh(f->header->n_data) > 0 &&
2929 le64toh(f->header->n_fields) == 0)
2930 return true;
2931
fb0951b0
LP
2932 if (max_file_usec > 0) {
2933 usec_t t, h;
2934
2935 h = le64toh(f->header->head_entry_realtime);
2936 t = now(CLOCK_REALTIME);
2937
2938 if (h > 0 && t > h + max_file_usec)
2939 return true;
2940 }
2941
dca6219e
LP
2942 return false;
2943}