]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
install: make InstallContext::{will_install,have_installed} OrderedHashmaps
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <sys/mman.h>
23#include <errno.h>
24#include <sys/uio.h>
25#include <unistd.h>
26#include <sys/statvfs.h>
27#include <fcntl.h>
28#include <stddef.h>
d2edfae0 29#include <sys/xattr.h>
fb0951b0 30
cec736d2
LP
31#include "journal-def.h"
32#include "journal-file.h"
0284adc6 33#include "journal-authenticate.h"
cec736d2 34#include "lookup3.h"
807e17f0 35#include "compress.h"
7560fffc 36#include "fsprg.h"
cec736d2 37
4a92baf3
LP
/* Sizes in bytes of the hash tables: a lower bound for the data hash table
 * and the fixed size used for the field hash table */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Data payloads shorter than this are stored without attempting compression */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* Smallest size a journal file may have */
#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)                /* 4 MiB */

/* Lower and upper bounds applied when the max_use value is deduced from
 * the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)                /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)        /* 4 GiB */

/* Upper bound applied when max_size is deduced from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)             /* 128 MiB */

/* Upper bound applied when the keep_free value is deduced from the file
 * system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)      /* 4 GiB */

/* keep_free value used when the file system size cannot be determined */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                         /* 1 MiB */

/* n_data was the first field added to the header after the initial file
 * format design, hence everything before it must always be present */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* Maximum number of entries kept in the entry array chain cache */
#define CHAIN_CACHE_MAX 20

/* Granularity by which the journal file grows whenever new space is allocated */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)                   /* 8 MiB */
70
9588bc32 71static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
72 assert(f);
73
74 if (!f->writable)
75 return -EPERM;
76
77 if (!(f->fd >= 0 && f->header))
78 return -EINVAL;
79
80 switch(f->header->state) {
81 case STATE_ONLINE:
82 return 0;
83
84 case STATE_OFFLINE:
85 f->header->state = STATE_ONLINE;
86 fsync(f->fd);
87 return 0;
88
89 default:
90 return -EINVAL;
91 }
92}
93
94int journal_file_set_offline(JournalFile *f) {
95 assert(f);
96
97 if (!f->writable)
98 return -EPERM;
99
100 if (!(f->fd >= 0 && f->header))
101 return -EINVAL;
102
103 if (f->header->state != STATE_ONLINE)
104 return 0;
105
106 fsync(f->fd);
107
108 f->header->state = STATE_OFFLINE;
109
110 fsync(f->fd);
111
112 return 0;
113}
114
cec736d2 115void journal_file_close(JournalFile *f) {
de190aef 116 assert(f);
cec736d2 117
feb12d3e 118#ifdef HAVE_GCRYPT
b0af6f41 119 /* Write the final tag */
c586dbf1 120 if (f->seal && f->writable)
b0af6f41 121 journal_file_append_tag(f);
feb12d3e 122#endif
b0af6f41 123
7560fffc 124 /* Sync everything to disk, before we mark the file offline */
16e9f408
LP
125 if (f->mmap && f->fd >= 0)
126 mmap_cache_close_fd(f->mmap, f->fd);
7560fffc 127
26687bf8 128 journal_file_set_offline(f);
cec736d2 129
26687bf8 130 if (f->header)
d384c7a8 131 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
cec736d2 132
03e334a1 133 safe_close(f->fd);
cec736d2 134 free(f->path);
807e17f0 135
16e9f408
LP
136 if (f->mmap)
137 mmap_cache_unref(f->mmap);
138
a4bcff5b
LP
139 hashmap_free_free(f->chain_cache);
140
d89c8fdf 141#if defined(HAVE_XZ) || defined(HAVE_LZ4)
807e17f0
LP
142 free(f->compress_buffer);
143#endif
144
7560fffc 145#ifdef HAVE_GCRYPT
baed47c3
LP
146 if (f->fss_file)
147 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
b7c9ae91
LP
148 else if (f->fsprg_state)
149 free(f->fsprg_state);
150
151 free(f->fsprg_seed);
7560fffc
LP
152
153 if (f->hmac)
154 gcry_md_close(f->hmac);
155#endif
156
cec736d2
LP
157 free(f);
158}
159
0ac38b70 160static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 161 Header h = {};
cec736d2
LP
162 ssize_t k;
163 int r;
164
165 assert(f);
166
7560fffc 167 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 168 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 169
d89c8fdf
ZJS
170 h.incompatible_flags |= htole32(
171 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
172 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 173
d89c8fdf
ZJS
174 h.compatible_flags = htole32(
175 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 176
cec736d2
LP
177 r = sd_id128_randomize(&h.file_id);
178 if (r < 0)
179 return r;
180
0ac38b70
LP
181 if (template) {
182 h.seqnum_id = template->header->seqnum_id;
beec0085 183 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
184 } else
185 h.seqnum_id = h.file_id;
cec736d2
LP
186
187 k = pwrite(f->fd, &h, sizeof(h), 0);
188 if (k < 0)
189 return -errno;
190
191 if (k != sizeof(h))
192 return -EIO;
193
194 return 0;
195}
196
197static int journal_file_refresh_header(JournalFile *f) {
198 int r;
de190aef 199 sd_id128_t boot_id;
cec736d2
LP
200
201 assert(f);
202
203 r = sd_id128_get_machine(&f->header->machine_id);
204 if (r < 0)
205 return r;
206
de190aef 207 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
208 if (r < 0)
209 return r;
210
de190aef
LP
211 if (sd_id128_equal(boot_id, f->header->boot_id))
212 f->tail_entry_monotonic_valid = true;
213
214 f->header->boot_id = boot_id;
215
26687bf8 216 journal_file_set_online(f);
b788cc23 217
7560fffc 218 /* Sync the online state to disk */
a676e665 219 fsync(f->fd);
b788cc23 220
cec736d2
LP
221 return 0;
222}
223
224static int journal_file_verify_header(JournalFile *f) {
d89c8fdf
ZJS
225 uint32_t flags;
226
cec736d2
LP
227 assert(f);
228
7560fffc 229 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
230 return -EBADMSG;
231
7560fffc
LP
232 /* In both read and write mode we refuse to open files with
233 * incompatible flags we don't know */
d89c8fdf
ZJS
234 flags = le32toh(f->header->incompatible_flags);
235 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
236 if (flags & ~HEADER_INCOMPATIBLE_ANY)
237 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
238 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
239 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
240 if (flags)
241 log_debug("Journal file %s uses incompatible flags %"PRIx32
242 " disabled at compilation time.", f->path, flags);
cec736d2 243 return -EPROTONOSUPPORT;
d89c8fdf 244 }
cec736d2 245
7560fffc
LP
246 /* When open for writing we refuse to open files with
247 * compatible flags, too */
d89c8fdf
ZJS
248 flags = le32toh(f->header->compatible_flags);
249 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
250 if (flags & ~HEADER_COMPATIBLE_ANY)
251 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
252 f->path, flags & ~HEADER_COMPATIBLE_ANY);
253 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
254 if (flags)
255 log_debug("Journal file %s uses compatible flags %"PRIx32
256 " disabled at compilation time.", f->path, flags);
257 return -EPROTONOSUPPORT;
7560fffc
LP
258 }
259
db11ac1a
LP
260 if (f->header->state >= _STATE_MAX)
261 return -EBADMSG;
262
dca6219e
LP
263 /* The first addition was n_data, so check that we are at least this large */
264 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
265 return -EBADMSG;
266
8088cbd3 267 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
268 return -EBADMSG;
269
db11ac1a
LP
270 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
271 return -ENODATA;
272
273 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
274 return -ENODATA;
275
7762e02b
LP
276 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
277 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
278 !VALID64(le64toh(f->header->tail_object_offset)) ||
279 !VALID64(le64toh(f->header->entry_array_offset)))
280 return -ENODATA;
281
cec736d2 282 if (f->writable) {
ccdbaf91 283 uint8_t state;
cec736d2
LP
284 sd_id128_t machine_id;
285 int r;
286
287 r = sd_id128_get_machine(&machine_id);
288 if (r < 0)
289 return r;
290
291 if (!sd_id128_equal(machine_id, f->header->machine_id))
292 return -EHOSTDOWN;
293
de190aef 294 state = f->header->state;
cec736d2 295
71fa6f00
LP
296 if (state == STATE_ONLINE) {
297 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
298 return -EBUSY;
299 } else if (state == STATE_ARCHIVED)
cec736d2 300 return -ESHUTDOWN;
71fa6f00
LP
301 else if (state != STATE_OFFLINE) {
302 log_debug("Journal file %s has unknown state %u.", f->path, state);
303 return -EBUSY;
304 }
cec736d2
LP
305 }
306
d89c8fdf
ZJS
307 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
308 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 309
f1889c91 310 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 311
cec736d2
LP
312 return 0;
313}
314
315static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 316 uint64_t old_size, new_size;
fec2aa2f 317 int r;
cec736d2
LP
318
319 assert(f);
320
cec736d2 321 /* We assume that this file is not sparse, and we know that
38ac38b2 322 * for sure, since we always call posix_fallocate()
cec736d2
LP
323 * ourselves */
324
325 old_size =
23b0b2b2 326 le64toh(f->header->header_size) +
cec736d2
LP
327 le64toh(f->header->arena_size);
328
bc85bfee 329 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
330 if (new_size < le64toh(f->header->header_size))
331 new_size = le64toh(f->header->header_size);
bc85bfee
LP
332
333 if (new_size <= old_size)
cec736d2
LP
334 return 0;
335
a676e665 336 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 337 return -E2BIG;
cec736d2 338
a676e665 339 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
340 struct statvfs svfs;
341
342 if (fstatvfs(f->fd, &svfs) >= 0) {
343 uint64_t available;
344
345 available = svfs.f_bfree * svfs.f_bsize;
346
bc85bfee
LP
347 if (available >= f->metrics.keep_free)
348 available -= f->metrics.keep_free;
cec736d2
LP
349 else
350 available = 0;
351
352 if (new_size - old_size > available)
353 return -E2BIG;
354 }
355 }
356
eda4b58b
LP
357 /* Increase by larger blocks at once */
358 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
359 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
360 new_size = f->metrics.max_size;
361
bc85bfee
LP
362 /* Note that the glibc fallocate() fallback is very
363 inefficient, hence we try to minimize the allocation area
364 as we can. */
fec2aa2f
GV
365 r = posix_fallocate(f->fd, old_size, new_size - old_size);
366 if (r != 0)
367 return -r;
cec736d2 368
eda4b58b
LP
369 if (fstat(f->fd, &f->last_stat) < 0)
370 return -errno;
cec736d2 371
23b0b2b2 372 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2
LP
373
374 return 0;
375}
376
fcde2389 377static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
cec736d2 378 assert(f);
cec736d2
LP
379 assert(ret);
380
7762e02b
LP
381 if (size <= 0)
382 return -EINVAL;
383
2a59ea54 384 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
385 if (offset + size > (uint64_t) f->last_stat.st_size) {
386 /* Hmm, out of range? Let's refresh the fstat() data
387 * first, before we trust that check. */
388
389 if (fstat(f->fd, &f->last_stat) < 0 ||
390 offset + size > (uint64_t) f->last_stat.st_size)
391 return -EADDRNOTAVAIL;
392 }
393
06cc69d4 394 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret, NULL);
cec736d2
LP
395}
396
16e9f408
LP
397static uint64_t minimum_header_size(Object *o) {
398
b8e891e6 399 static const uint64_t table[] = {
16e9f408
LP
400 [OBJECT_DATA] = sizeof(DataObject),
401 [OBJECT_FIELD] = sizeof(FieldObject),
402 [OBJECT_ENTRY] = sizeof(EntryObject),
403 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
404 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
405 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
406 [OBJECT_TAG] = sizeof(TagObject),
407 };
408
409 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
410 return sizeof(ObjectHeader);
411
412 return table[o->object.type];
413}
414
de190aef 415int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
cec736d2
LP
416 int r;
417 void *t;
418 Object *o;
419 uint64_t s;
420
421 assert(f);
422 assert(ret);
423
db11ac1a
LP
424 /* Objects may only be located at multiple of 64 bit */
425 if (!VALID64(offset))
426 return -EFAULT;
427
ae97089d 428 r = journal_file_move_to(f, type_to_context(type), false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
429 if (r < 0)
430 return r;
431
432 o = (Object*) t;
433 s = le64toh(o->object.size);
434
435 if (s < sizeof(ObjectHeader))
436 return -EBADMSG;
437
16e9f408
LP
438 if (o->object.type <= OBJECT_UNUSED)
439 return -EBADMSG;
440
441 if (s < minimum_header_size(o))
442 return -EBADMSG;
443
3c1668da 444 if (type > 0 && o->object.type != type)
cec736d2
LP
445 return -EBADMSG;
446
447 if (s > sizeof(ObjectHeader)) {
fcde2389 448 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
cec736d2
LP
449 if (r < 0)
450 return r;
451
452 o = (Object*) t;
453 }
454
cec736d2
LP
455 *ret = o;
456 return 0;
457}
458
d98cc1f2 459static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
460 uint64_t r;
461
462 assert(f);
463
beec0085 464 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
465
466 if (seqnum) {
de190aef 467 /* If an external seqnum counter was passed, we update
c2373f84
LP
468 * both the local and the external one, and set it to
469 * the maximum of both */
470
471 if (*seqnum + 1 > r)
472 r = *seqnum + 1;
473
474 *seqnum = r;
475 }
476
beec0085 477 f->header->tail_entry_seqnum = htole64(r);
cec736d2 478
beec0085
LP
479 if (f->header->head_entry_seqnum == 0)
480 f->header->head_entry_seqnum = htole64(r);
de190aef 481
cec736d2
LP
482 return r;
483}
484
0284adc6 485int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
486 int r;
487 uint64_t p;
488 Object *tail, *o;
489 void *t;
490
491 assert(f);
16e9f408 492 assert(type > 0 && type < _OBJECT_TYPE_MAX);
cec736d2
LP
493 assert(size >= sizeof(ObjectHeader));
494 assert(offset);
495 assert(ret);
496
26687bf8
OS
497 r = journal_file_set_online(f);
498 if (r < 0)
499 return r;
500
cec736d2 501 p = le64toh(f->header->tail_object_offset);
cec736d2 502 if (p == 0)
23b0b2b2 503 p = le64toh(f->header->header_size);
cec736d2 504 else {
de190aef 505 r = journal_file_move_to_object(f, -1, p, &tail);
cec736d2
LP
506 if (r < 0)
507 return r;
508
509 p += ALIGN64(le64toh(tail->object.size));
510 }
511
512 r = journal_file_allocate(f, p, size);
513 if (r < 0)
514 return r;
515
fcde2389 516 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
517 if (r < 0)
518 return r;
519
520 o = (Object*) t;
521
522 zero(o->object);
de190aef 523 o->object.type = type;
cec736d2
LP
524 o->object.size = htole64(size);
525
526 f->header->tail_object_offset = htole64(p);
cec736d2
LP
527 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
528
529 *ret = o;
530 *offset = p;
531
532 return 0;
533}
534
de190aef 535static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
536 uint64_t s, p;
537 Object *o;
538 int r;
539
540 assert(f);
541
dfabe643 542 /* We estimate that we need 1 hash table entry per 768 of
4a92baf3
LP
543 journal file and we want to make sure we never get beyond
544 75% fill level. Calculate the hash table size for the
545 maximum file size based on these metrics. */
546
dfabe643 547 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
548 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
549 s = DEFAULT_DATA_HASH_TABLE_SIZE;
550
507f22bd 551 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 552
de190aef
LP
553 r = journal_file_append_object(f,
554 OBJECT_DATA_HASH_TABLE,
555 offsetof(Object, hash_table.items) + s,
556 &o, &p);
cec736d2
LP
557 if (r < 0)
558 return r;
559
29804cc1 560 memzero(o->hash_table.items, s);
cec736d2 561
de190aef
LP
562 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
563 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
564
565 return 0;
566}
567
de190aef 568static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
569 uint64_t s, p;
570 Object *o;
571 int r;
572
573 assert(f);
574
3c1668da
LP
575 /* We use a fixed size hash table for the fields as this
576 * number should grow very slowly only */
577
de190aef
LP
578 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
579 r = journal_file_append_object(f,
580 OBJECT_FIELD_HASH_TABLE,
581 offsetof(Object, hash_table.items) + s,
582 &o, &p);
cec736d2
LP
583 if (r < 0)
584 return r;
585
29804cc1 586 memzero(o->hash_table.items, s);
cec736d2 587
de190aef
LP
588 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
589 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
590
591 return 0;
592}
593
de190aef 594static int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
595 uint64_t s, p;
596 void *t;
597 int r;
598
599 assert(f);
600
de190aef
LP
601 p = le64toh(f->header->data_hash_table_offset);
602 s = le64toh(f->header->data_hash_table_size);
cec736d2 603
de190aef 604 r = journal_file_move_to(f,
16e9f408 605 OBJECT_DATA_HASH_TABLE,
fcde2389 606 true,
de190aef
LP
607 p, s,
608 &t);
cec736d2
LP
609 if (r < 0)
610 return r;
611
de190aef 612 f->data_hash_table = t;
cec736d2
LP
613 return 0;
614}
615
de190aef 616static int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
617 uint64_t s, p;
618 void *t;
619 int r;
620
621 assert(f);
622
de190aef
LP
623 p = le64toh(f->header->field_hash_table_offset);
624 s = le64toh(f->header->field_hash_table_size);
cec736d2 625
de190aef 626 r = journal_file_move_to(f,
16e9f408 627 OBJECT_FIELD_HASH_TABLE,
fcde2389 628 true,
de190aef
LP
629 p, s,
630 &t);
cec736d2
LP
631 if (r < 0)
632 return r;
633
de190aef 634 f->field_hash_table = t;
cec736d2
LP
635 return 0;
636}
637
3c1668da
LP
638static int journal_file_link_field(
639 JournalFile *f,
640 Object *o,
641 uint64_t offset,
642 uint64_t hash) {
643
644 uint64_t p, h;
645 int r;
646
647 assert(f);
648 assert(o);
649 assert(offset > 0);
650
651 if (o->object.type != OBJECT_FIELD)
652 return -EINVAL;
653
654 /* This might alter the window we are looking at */
655
656 o->field.next_hash_offset = o->field.head_data_offset = 0;
657
658 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
659 p = le64toh(f->field_hash_table[h].tail_hash_offset);
660 if (p == 0)
661 f->field_hash_table[h].head_hash_offset = htole64(offset);
662 else {
663 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
664 if (r < 0)
665 return r;
666
667 o->field.next_hash_offset = htole64(offset);
668 }
669
670 f->field_hash_table[h].tail_hash_offset = htole64(offset);
671
672 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
673 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
674
675 return 0;
676}
677
678static int journal_file_link_data(
679 JournalFile *f,
680 Object *o,
681 uint64_t offset,
682 uint64_t hash) {
683
de190aef 684 uint64_t p, h;
cec736d2
LP
685 int r;
686
687 assert(f);
688 assert(o);
689 assert(offset > 0);
b588975f
LP
690
691 if (o->object.type != OBJECT_DATA)
692 return -EINVAL;
cec736d2 693
48496df6
LP
694 /* This might alter the window we are looking at */
695
de190aef
LP
696 o->data.next_hash_offset = o->data.next_field_offset = 0;
697 o->data.entry_offset = o->data.entry_array_offset = 0;
698 o->data.n_entries = 0;
cec736d2 699
de190aef 700 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
8db4213e 701 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 702 if (p == 0)
cec736d2 703 /* Only entry in the hash table is easy */
de190aef 704 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 705 else {
48496df6
LP
706 /* Move back to the previous data object, to patch in
707 * pointer */
cec736d2 708
de190aef 709 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
710 if (r < 0)
711 return r;
712
de190aef 713 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
714 }
715
de190aef 716 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 717
dca6219e
LP
718 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
719 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
720
cec736d2
LP
721 return 0;
722}
723
3c1668da
LP
724int journal_file_find_field_object_with_hash(
725 JournalFile *f,
726 const void *field, uint64_t size, uint64_t hash,
727 Object **ret, uint64_t *offset) {
728
729 uint64_t p, osize, h;
730 int r;
731
732 assert(f);
733 assert(field && size > 0);
734
735 osize = offsetof(Object, field.payload) + size;
736
737 if (f->header->field_hash_table_size == 0)
738 return -EBADMSG;
739
740 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
741 p = le64toh(f->field_hash_table[h].head_hash_offset);
742
743 while (p > 0) {
744 Object *o;
745
746 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
747 if (r < 0)
748 return r;
749
750 if (le64toh(o->field.hash) == hash &&
751 le64toh(o->object.size) == osize &&
752 memcmp(o->field.payload, field, size) == 0) {
753
754 if (ret)
755 *ret = o;
756 if (offset)
757 *offset = p;
758
759 return 1;
760 }
761
762 p = le64toh(o->field.next_hash_offset);
763 }
764
765 return 0;
766}
767
768int journal_file_find_field_object(
769 JournalFile *f,
770 const void *field, uint64_t size,
771 Object **ret, uint64_t *offset) {
772
773 uint64_t hash;
774
775 assert(f);
776 assert(field && size > 0);
777
778 hash = hash64(field, size);
779
780 return journal_file_find_field_object_with_hash(f,
781 field, size, hash,
782 ret, offset);
783}
784
de190aef
LP
785int journal_file_find_data_object_with_hash(
786 JournalFile *f,
787 const void *data, uint64_t size, uint64_t hash,
788 Object **ret, uint64_t *offset) {
48496df6 789
de190aef 790 uint64_t p, osize, h;
cec736d2
LP
791 int r;
792
793 assert(f);
794 assert(data || size == 0);
795
796 osize = offsetof(Object, data.payload) + size;
797
bc85bfee
LP
798 if (f->header->data_hash_table_size == 0)
799 return -EBADMSG;
800
de190aef
LP
801 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
802 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 803
de190aef
LP
804 while (p > 0) {
805 Object *o;
cec736d2 806
de190aef 807 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
808 if (r < 0)
809 return r;
810
807e17f0 811 if (le64toh(o->data.hash) != hash)
85a131e8 812 goto next;
807e17f0 813
d89c8fdf 814 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 815#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51
ZJS
816 uint64_t l;
817 size_t rsize;
cec736d2 818
807e17f0
LP
819 l = le64toh(o->object.size);
820 if (l <= offsetof(Object, data.payload))
cec736d2
LP
821 return -EBADMSG;
822
807e17f0
LP
823 l -= offsetof(Object, data.payload);
824
d89c8fdf
ZJS
825 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
826 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
827 if (r < 0)
828 return r;
807e17f0 829
b785c858 830 if (rsize == size &&
807e17f0
LP
831 memcmp(f->compress_buffer, data, size) == 0) {
832
833 if (ret)
834 *ret = o;
835
836 if (offset)
837 *offset = p;
838
839 return 1;
840 }
3b1a55e1
ZJS
841#else
842 return -EPROTONOSUPPORT;
843#endif
807e17f0
LP
844 } else if (le64toh(o->object.size) == osize &&
845 memcmp(o->data.payload, data, size) == 0) {
846
cec736d2
LP
847 if (ret)
848 *ret = o;
849
850 if (offset)
851 *offset = p;
852
de190aef 853 return 1;
cec736d2
LP
854 }
855
85a131e8 856 next:
cec736d2
LP
857 p = le64toh(o->data.next_hash_offset);
858 }
859
de190aef
LP
860 return 0;
861}
862
863int journal_file_find_data_object(
864 JournalFile *f,
865 const void *data, uint64_t size,
866 Object **ret, uint64_t *offset) {
867
868 uint64_t hash;
869
870 assert(f);
871 assert(data || size == 0);
872
873 hash = hash64(data, size);
874
875 return journal_file_find_data_object_with_hash(f,
876 data, size, hash,
877 ret, offset);
878}
879
3c1668da
LP
880static int journal_file_append_field(
881 JournalFile *f,
882 const void *field, uint64_t size,
883 Object **ret, uint64_t *offset) {
884
885 uint64_t hash, p;
886 uint64_t osize;
887 Object *o;
888 int r;
889
890 assert(f);
891 assert(field && size > 0);
892
893 hash = hash64(field, size);
894
895 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
896 if (r < 0)
897 return r;
898 else if (r > 0) {
899
900 if (ret)
901 *ret = o;
902
903 if (offset)
904 *offset = p;
905
906 return 0;
907 }
908
909 osize = offsetof(Object, field.payload) + size;
910 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
911 if (r < 0)
912 return r;
3c1668da
LP
913
914 o->field.hash = htole64(hash);
915 memcpy(o->field.payload, field, size);
916
917 r = journal_file_link_field(f, o, p, hash);
918 if (r < 0)
919 return r;
920
921 /* The linking might have altered the window, so let's
922 * refresh our pointer */
923 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
924 if (r < 0)
925 return r;
926
927#ifdef HAVE_GCRYPT
928 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
929 if (r < 0)
930 return r;
931#endif
932
933 if (ret)
934 *ret = o;
935
936 if (offset)
937 *offset = p;
938
939 return 0;
940}
941
48496df6
LP
942static int journal_file_append_data(
943 JournalFile *f,
944 const void *data, uint64_t size,
945 Object **ret, uint64_t *offset) {
946
de190aef
LP
947 uint64_t hash, p;
948 uint64_t osize;
949 Object *o;
d89c8fdf 950 int r, compression = 0;
3c1668da 951 const void *eq;
de190aef
LP
952
953 assert(f);
954 assert(data || size == 0);
955
956 hash = hash64(data, size);
957
958 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
959 if (r < 0)
960 return r;
961 else if (r > 0) {
962
963 if (ret)
964 *ret = o;
965
966 if (offset)
967 *offset = p;
968
969 return 0;
970 }
971
972 osize = offsetof(Object, data.payload) + size;
973 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
974 if (r < 0)
975 return r;
976
cec736d2 977 o->data.hash = htole64(hash);
807e17f0 978
d89c8fdf
ZJS
979#if defined(HAVE_XZ) || defined(HAVE_LZ4)
980 if (f->compress_xz &&
807e17f0 981 size >= COMPRESSION_SIZE_THRESHOLD) {
fa1c4b51 982 size_t rsize;
807e17f0 983
d89c8fdf 984 compression = compress_blob(data, size, o->data.payload, &rsize);
807e17f0 985
d89c8fdf 986 if (compression) {
807e17f0 987 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 988 o->object.flags |= compression;
807e17f0 989
fa1c4b51 990 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 991 size, rsize, object_compressed_to_string(compression));
807e17f0
LP
992 }
993 }
994#endif
995
d89c8fdf 996 if (!compression && size > 0)
807e17f0 997 memcpy(o->data.payload, data, size);
cec736d2 998
de190aef 999 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1000 if (r < 0)
1001 return r;
1002
48496df6
LP
1003 /* The linking might have altered the window, so let's
1004 * refresh our pointer */
1005 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1006 if (r < 0)
1007 return r;
1008
08c6f819
SL
1009 if (!data)
1010 eq = NULL;
1011 else
1012 eq = memchr(data, '=', size);
3c1668da 1013 if (eq && eq > data) {
748db592 1014 Object *fo = NULL;
3c1668da 1015 uint64_t fp;
3c1668da
LP
1016
1017 /* Create field object ... */
1018 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1019 if (r < 0)
1020 return r;
1021
1022 /* ... and link it in. */
1023 o->data.next_field_offset = fo->field.head_data_offset;
1024 fo->field.head_data_offset = le64toh(p);
1025 }
1026
5996c7c2
LP
1027#ifdef HAVE_GCRYPT
1028 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1029 if (r < 0)
1030 return r;
1031#endif
1032
cec736d2
LP
1033 if (ret)
1034 *ret = o;
1035
1036 if (offset)
de190aef 1037 *offset = p;
cec736d2
LP
1038
1039 return 0;
1040}
1041
1042uint64_t journal_file_entry_n_items(Object *o) {
1043 assert(o);
b588975f
LP
1044
1045 if (o->object.type != OBJECT_ENTRY)
1046 return 0;
cec736d2
LP
1047
1048 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1049}
1050
0284adc6 1051uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1052 assert(o);
b588975f
LP
1053
1054 if (o->object.type != OBJECT_ENTRY_ARRAY)
1055 return 0;
de190aef
LP
1056
1057 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1058}
1059
fb9a24b6
LP
1060uint64_t journal_file_hash_table_n_items(Object *o) {
1061 assert(o);
b588975f
LP
1062
1063 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1064 o->object.type != OBJECT_FIELD_HASH_TABLE)
1065 return 0;
fb9a24b6
LP
1066
1067 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1068}
1069
de190aef 1070static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1071 le64_t *first,
1072 le64_t *idx,
de190aef 1073 uint64_t p) {
cec736d2 1074 int r;
de190aef
LP
1075 uint64_t n = 0, ap = 0, q, i, a, hidx;
1076 Object *o;
1077
cec736d2 1078 assert(f);
de190aef
LP
1079 assert(first);
1080 assert(idx);
1081 assert(p > 0);
cec736d2 1082
de190aef
LP
1083 a = le64toh(*first);
1084 i = hidx = le64toh(*idx);
1085 while (a > 0) {
1086
1087 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1088 if (r < 0)
1089 return r;
cec736d2 1090
de190aef
LP
1091 n = journal_file_entry_array_n_items(o);
1092 if (i < n) {
1093 o->entry_array.items[i] = htole64(p);
1094 *idx = htole64(hidx + 1);
1095 return 0;
1096 }
cec736d2 1097
de190aef
LP
1098 i -= n;
1099 ap = a;
1100 a = le64toh(o->entry_array.next_entry_array_offset);
1101 }
1102
1103 if (hidx > n)
1104 n = (hidx+1) * 2;
1105 else
1106 n = n * 2;
1107
1108 if (n < 4)
1109 n = 4;
1110
1111 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1112 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1113 &o, &q);
cec736d2
LP
1114 if (r < 0)
1115 return r;
1116
feb12d3e 1117#ifdef HAVE_GCRYPT
5996c7c2 1118 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1119 if (r < 0)
1120 return r;
feb12d3e 1121#endif
b0af6f41 1122
de190aef 1123 o->entry_array.items[i] = htole64(p);
cec736d2 1124
de190aef 1125 if (ap == 0)
7be3aa17 1126 *first = htole64(q);
cec736d2 1127 else {
de190aef 1128 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1129 if (r < 0)
1130 return r;
1131
de190aef
LP
1132 o->entry_array.next_entry_array_offset = htole64(q);
1133 }
cec736d2 1134
2dee23eb
LP
1135 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1136 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1137
de190aef
LP
1138 *idx = htole64(hidx + 1);
1139
1140 return 0;
1141}
cec736d2 1142
de190aef 1143static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1144 le64_t *extra,
1145 le64_t *first,
1146 le64_t *idx,
de190aef
LP
1147 uint64_t p) {
1148
1149 int r;
1150
1151 assert(f);
1152 assert(extra);
1153 assert(first);
1154 assert(idx);
1155 assert(p > 0);
1156
1157 if (*idx == 0)
1158 *extra = htole64(p);
1159 else {
4fd052ae 1160 le64_t i;
de190aef 1161
7be3aa17 1162 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1163 r = link_entry_into_array(f, first, &i, p);
1164 if (r < 0)
1165 return r;
cec736d2
LP
1166 }
1167
de190aef
LP
1168 *idx = htole64(le64toh(*idx) + 1);
1169 return 0;
1170}
1171
1172static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1173 uint64_t p;
1174 int r;
1175 assert(f);
1176 assert(o);
1177 assert(offset > 0);
1178
1179 p = le64toh(o->entry.items[i].object_offset);
1180 if (p == 0)
1181 return -EINVAL;
1182
1183 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1184 if (r < 0)
1185 return r;
1186
de190aef
LP
1187 return link_entry_into_array_plus_one(f,
1188 &o->data.entry_offset,
1189 &o->data.entry_array_offset,
1190 &o->data.n_entries,
1191 offset);
cec736d2
LP
1192}
1193
1194static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1195 uint64_t n, i;
cec736d2
LP
1196 int r;
1197
1198 assert(f);
1199 assert(o);
1200 assert(offset > 0);
b588975f
LP
1201
1202 if (o->object.type != OBJECT_ENTRY)
1203 return -EINVAL;
cec736d2 1204
b788cc23
LP
1205 __sync_synchronize();
1206
cec736d2 1207 /* Link up the entry itself */
de190aef
LP
1208 r = link_entry_into_array(f,
1209 &f->header->entry_array_offset,
1210 &f->header->n_entries,
1211 offset);
1212 if (r < 0)
1213 return r;
cec736d2 1214
507f22bd 1215 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1216
de190aef 1217 if (f->header->head_entry_realtime == 0)
0ac38b70 1218 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1219
0ac38b70 1220 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1221 f->header->tail_entry_monotonic = o->entry.monotonic;
1222
1223 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1224
1225 /* Link up the items */
1226 n = journal_file_entry_n_items(o);
1227 for (i = 0; i < n; i++) {
1228 r = journal_file_link_entry_item(f, o, offset, i);
1229 if (r < 0)
1230 return r;
1231 }
1232
cec736d2
LP
1233 return 0;
1234}
1235
1236static int journal_file_append_entry_internal(
1237 JournalFile *f,
1238 const dual_timestamp *ts,
1239 uint64_t xor_hash,
1240 const EntryItem items[], unsigned n_items,
de190aef 1241 uint64_t *seqnum,
cec736d2
LP
1242 Object **ret, uint64_t *offset) {
1243 uint64_t np;
1244 uint64_t osize;
1245 Object *o;
1246 int r;
1247
1248 assert(f);
1249 assert(items || n_items == 0);
de190aef 1250 assert(ts);
cec736d2
LP
1251
1252 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1253
de190aef 1254 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1255 if (r < 0)
1256 return r;
1257
d98cc1f2 1258 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1259 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1260 o->entry.realtime = htole64(ts->realtime);
1261 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1262 o->entry.xor_hash = htole64(xor_hash);
1263 o->entry.boot_id = f->header->boot_id;
1264
feb12d3e 1265#ifdef HAVE_GCRYPT
5996c7c2 1266 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1267 if (r < 0)
1268 return r;
feb12d3e 1269#endif
b0af6f41 1270
cec736d2
LP
1271 r = journal_file_link_entry(f, o, np);
1272 if (r < 0)
1273 return r;
1274
1275 if (ret)
1276 *ret = o;
1277
1278 if (offset)
1279 *offset = np;
1280
1281 return 0;
1282}
1283
cf244689 1284void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1285 assert(f);
1286
1287 /* inotify() does not receive IN_MODIFY events from file
1288 * accesses done via mmap(). After each access we hence
1289 * trigger IN_MODIFY by truncating the journal file to its
1290 * current size which triggers IN_MODIFY. */
1291
bc85bfee
LP
1292 __sync_synchronize();
1293
50f20cfd 1294 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
c5315881 1295 log_error("Failed to truncate file to its own size: %m");
50f20cfd
LP
1296}
1297
1f2da9ec
LP
1298static int entry_item_cmp(const void *_a, const void *_b) {
1299 const EntryItem *a = _a, *b = _b;
1300
1301 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1302 return -1;
1303 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1304 return 1;
1305 return 0;
1306}
1307
de190aef 1308int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1309 unsigned i;
1310 EntryItem *items;
1311 int r;
1312 uint64_t xor_hash = 0;
de190aef 1313 struct dual_timestamp _ts;
cec736d2
LP
1314
1315 assert(f);
1316 assert(iovec || n_iovec == 0);
1317
de190aef
LP
1318 if (!ts) {
1319 dual_timestamp_get(&_ts);
1320 ts = &_ts;
1321 }
1322
1323 if (f->tail_entry_monotonic_valid &&
1324 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1325 return -EINVAL;
1326
feb12d3e 1327#ifdef HAVE_GCRYPT
7560fffc
LP
1328 r = journal_file_maybe_append_tag(f, ts->realtime);
1329 if (r < 0)
1330 return r;
feb12d3e 1331#endif
7560fffc 1332
64825d3c 1333 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1334 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1335
1336 for (i = 0; i < n_iovec; i++) {
1337 uint64_t p;
1338 Object *o;
1339
1340 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1341 if (r < 0)
cf244689 1342 return r;
cec736d2
LP
1343
1344 xor_hash ^= le64toh(o->data.hash);
1345 items[i].object_offset = htole64(p);
de7b95cd 1346 items[i].hash = o->data.hash;
cec736d2
LP
1347 }
1348
1f2da9ec
LP
1349 /* Order by the position on disk, in order to improve seek
1350 * times for rotating media. */
7ff7394d 1351 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1352
de190aef 1353 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1354
50f20cfd
LP
1355 journal_file_post_change(f);
1356
cec736d2
LP
1357 return r;
1358}
1359
/* One cached position inside a chain of entry array objects. Items
 * are stored in a hashmap keyed by a pointer to their 'first' field. */
typedef struct ChainCacheItem {
        /* Offset of the array at the beginning of the chain */
        uint64_t first;
        /* Offset of the cached array */
        uint64_t array;
        /* The first item in the cached array */
        uint64_t begin;
        /* Total number of items in all arrays before the cached one in the chain */
        uint64_t total;
        /* The last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1367
1368static void chain_cache_put(
1369 Hashmap *h,
1370 ChainCacheItem *ci,
1371 uint64_t first,
1372 uint64_t array,
1373 uint64_t begin,
f268980d
LP
1374 uint64_t total,
1375 uint64_t last_index) {
a4bcff5b
LP
1376
1377 if (!ci) {
34741aa3
LP
1378 /* If the chain item to cache for this chain is the
1379 * first one it's not worth caching anything */
1380 if (array == first)
1381 return;
1382
a4bcff5b
LP
1383 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1384 ci = hashmap_steal_first(h);
1385 else {
1386 ci = new(ChainCacheItem, 1);
1387 if (!ci)
1388 return;
1389 }
1390
1391 ci->first = first;
1392
1393 if (hashmap_put(h, &ci->first, ci) < 0) {
1394 free(ci);
1395 return;
1396 }
1397 } else
1398 assert(ci->first == first);
1399
1400 ci->array = array;
1401 ci->begin = begin;
1402 ci->total = total;
f268980d 1403 ci->last_index = last_index;
a4bcff5b
LP
1404}
1405
f268980d
LP
1406static int generic_array_get(
1407 JournalFile *f,
1408 uint64_t first,
1409 uint64_t i,
1410 Object **ret, uint64_t *offset) {
de190aef 1411
cec736d2 1412 Object *o;
a4bcff5b 1413 uint64_t p = 0, a, t = 0;
cec736d2 1414 int r;
a4bcff5b 1415 ChainCacheItem *ci;
cec736d2
LP
1416
1417 assert(f);
1418
de190aef 1419 a = first;
a4bcff5b
LP
1420
1421 /* Try the chain cache first */
1422 ci = hashmap_get(f->chain_cache, &first);
1423 if (ci && i > ci->total) {
1424 a = ci->array;
1425 i -= ci->total;
1426 t = ci->total;
1427 }
1428
de190aef 1429 while (a > 0) {
a4bcff5b 1430 uint64_t k;
cec736d2 1431
de190aef
LP
1432 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1433 if (r < 0)
1434 return r;
cec736d2 1435
a4bcff5b
LP
1436 k = journal_file_entry_array_n_items(o);
1437 if (i < k) {
de190aef 1438 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1439 goto found;
cec736d2
LP
1440 }
1441
a4bcff5b
LP
1442 i -= k;
1443 t += k;
de190aef
LP
1444 a = le64toh(o->entry_array.next_entry_array_offset);
1445 }
1446
a4bcff5b
LP
1447 return 0;
1448
1449found:
1450 /* Let's cache this item for the next invocation */
af13a6b0 1451 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1452
1453 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1454 if (r < 0)
1455 return r;
1456
1457 if (ret)
1458 *ret = o;
1459
1460 if (offset)
1461 *offset = p;
1462
1463 return 1;
1464}
1465
f268980d
LP
1466static int generic_array_get_plus_one(
1467 JournalFile *f,
1468 uint64_t extra,
1469 uint64_t first,
1470 uint64_t i,
1471 Object **ret, uint64_t *offset) {
de190aef
LP
1472
1473 Object *o;
1474
1475 assert(f);
1476
1477 if (i == 0) {
1478 int r;
1479
1480 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1481 if (r < 0)
1482 return r;
1483
de190aef
LP
1484 if (ret)
1485 *ret = o;
cec736d2 1486
de190aef
LP
1487 if (offset)
1488 *offset = extra;
cec736d2 1489
de190aef 1490 return 1;
cec736d2
LP
1491 }
1492
de190aef
LP
1493 return generic_array_get(f, first, i-1, ret, offset);
1494}
cec736d2 1495
de190aef
LP
/* Results of the test_object callbacks used when bisecting: how the
 * probed object compares against the needle. */
enum {
        TEST_FOUND = 0, /* probed object matches the needle */
        TEST_LEFT  = 1, /* probed object is smaller than the needle */
        TEST_RIGHT = 2  /* probed object is larger than the needle */
};
cec736d2 1501
f268980d
LP
1502static int generic_array_bisect(
1503 JournalFile *f,
1504 uint64_t first,
1505 uint64_t n,
1506 uint64_t needle,
1507 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1508 direction_t direction,
1509 Object **ret,
1510 uint64_t *offset,
1511 uint64_t *idx) {
1512
1513 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1514 bool subtract_one = false;
1515 Object *o, *array = NULL;
1516 int r;
a4bcff5b 1517 ChainCacheItem *ci;
cec736d2 1518
de190aef
LP
1519 assert(f);
1520 assert(test_object);
cec736d2 1521
a4bcff5b 1522 /* Start with the first array in the chain */
de190aef 1523 a = first;
a4bcff5b
LP
1524
1525 ci = hashmap_get(f->chain_cache, &first);
1526 if (ci && n > ci->total) {
1527 /* Ah, we have iterated this bisection array chain
1528 * previously! Let's see if we can skip ahead in the
1529 * chain, as far as the last time. But we can't jump
1530 * backwards in the chain, so let's check that
1531 * first. */
1532
1533 r = test_object(f, ci->begin, needle);
1534 if (r < 0)
1535 return r;
1536
1537 if (r == TEST_LEFT) {
f268980d 1538 /* OK, what we are looking for is right of the
a4bcff5b
LP
1539 * begin of this EntryArray, so let's jump
1540 * straight to previously cached array in the
1541 * chain */
1542
1543 a = ci->array;
1544 n -= ci->total;
1545 t = ci->total;
f268980d 1546 last_index = ci->last_index;
a4bcff5b
LP
1547 }
1548 }
1549
de190aef
LP
1550 while (a > 0) {
1551 uint64_t left, right, k, lp;
1552
1553 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1554 if (r < 0)
1555 return r;
1556
de190aef
LP
1557 k = journal_file_entry_array_n_items(array);
1558 right = MIN(k, n);
1559 if (right <= 0)
1560 return 0;
cec736d2 1561
de190aef
LP
1562 i = right - 1;
1563 lp = p = le64toh(array->entry_array.items[i]);
1564 if (p <= 0)
1565 return -EBADMSG;
cec736d2 1566
de190aef
LP
1567 r = test_object(f, p, needle);
1568 if (r < 0)
1569 return r;
cec736d2 1570
de190aef
LP
1571 if (r == TEST_FOUND)
1572 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1573
1574 if (r == TEST_RIGHT) {
1575 left = 0;
1576 right -= 1;
f268980d
LP
1577
1578 if (last_index != (uint64_t) -1) {
1579 assert(last_index <= right);
1580
1581 /* If we cached the last index we
1582 * looked at, let's try to not to jump
1583 * too wildly around and see if we can
1584 * limit the range to look at early to
1585 * the immediate neighbors of the last
1586 * index we looked at. */
1587
1588 if (last_index > 0) {
1589 uint64_t x = last_index - 1;
1590
1591 p = le64toh(array->entry_array.items[x]);
1592 if (p <= 0)
1593 return -EBADMSG;
1594
1595 r = test_object(f, p, needle);
1596 if (r < 0)
1597 return r;
1598
1599 if (r == TEST_FOUND)
1600 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1601
1602 if (r == TEST_RIGHT)
1603 right = x;
1604 else
1605 left = x + 1;
1606 }
1607
1608 if (last_index < right) {
1609 uint64_t y = last_index + 1;
1610
1611 p = le64toh(array->entry_array.items[y]);
1612 if (p <= 0)
1613 return -EBADMSG;
1614
1615 r = test_object(f, p, needle);
1616 if (r < 0)
1617 return r;
1618
1619 if (r == TEST_FOUND)
1620 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1621
1622 if (r == TEST_RIGHT)
1623 right = y;
1624 else
1625 left = y + 1;
1626 }
f268980d
LP
1627 }
1628
de190aef
LP
1629 for (;;) {
1630 if (left == right) {
1631 if (direction == DIRECTION_UP)
1632 subtract_one = true;
1633
1634 i = left;
1635 goto found;
1636 }
1637
1638 assert(left < right);
de190aef 1639 i = (left + right) / 2;
f268980d 1640
de190aef
LP
1641 p = le64toh(array->entry_array.items[i]);
1642 if (p <= 0)
1643 return -EBADMSG;
1644
1645 r = test_object(f, p, needle);
1646 if (r < 0)
1647 return r;
cec736d2 1648
de190aef
LP
1649 if (r == TEST_FOUND)
1650 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1651
1652 if (r == TEST_RIGHT)
1653 right = i;
1654 else
1655 left = i + 1;
1656 }
1657 }
1658
cbdca852
LP
1659 if (k > n) {
1660 if (direction == DIRECTION_UP) {
1661 i = n;
1662 subtract_one = true;
1663 goto found;
1664 }
1665
cec736d2 1666 return 0;
cbdca852 1667 }
cec736d2 1668
de190aef
LP
1669 last_p = lp;
1670
1671 n -= k;
1672 t += k;
f268980d 1673 last_index = (uint64_t) -1;
de190aef 1674 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1675 }
1676
1677 return 0;
de190aef
LP
1678
1679found:
1680 if (subtract_one && t == 0 && i == 0)
1681 return 0;
1682
a4bcff5b 1683 /* Let's cache this item for the next invocation */
af13a6b0 1684 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 1685
de190aef
LP
1686 if (subtract_one && i == 0)
1687 p = last_p;
1688 else if (subtract_one)
1689 p = le64toh(array->entry_array.items[i-1]);
1690 else
1691 p = le64toh(array->entry_array.items[i]);
1692
1693 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1694 if (r < 0)
1695 return r;
1696
1697 if (ret)
1698 *ret = o;
1699
1700 if (offset)
1701 *offset = p;
1702
1703 if (idx)
cbdca852 1704 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1705
1706 return 1;
cec736d2
LP
1707}
1708
f268980d
LP
1709
1710static int generic_array_bisect_plus_one(
1711 JournalFile *f,
1712 uint64_t extra,
1713 uint64_t first,
1714 uint64_t n,
1715 uint64_t needle,
1716 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1717 direction_t direction,
1718 Object **ret,
1719 uint64_t *offset,
1720 uint64_t *idx) {
de190aef 1721
cec736d2 1722 int r;
cbdca852
LP
1723 bool step_back = false;
1724 Object *o;
cec736d2
LP
1725
1726 assert(f);
de190aef 1727 assert(test_object);
cec736d2 1728
de190aef
LP
1729 if (n <= 0)
1730 return 0;
cec736d2 1731
de190aef
LP
1732 /* This bisects the array in object 'first', but first checks
1733 * an extra */
de190aef
LP
1734 r = test_object(f, extra, needle);
1735 if (r < 0)
1736 return r;
a536e261
LP
1737
1738 if (r == TEST_FOUND)
1739 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1740
cbdca852
LP
1741 /* if we are looking with DIRECTION_UP then we need to first
1742 see if in the actual array there is a matching entry, and
1743 return the last one of that. But if there isn't any we need
1744 to return this one. Hence remember this, and return it
1745 below. */
1746 if (r == TEST_LEFT)
1747 step_back = direction == DIRECTION_UP;
de190aef 1748
cbdca852
LP
1749 if (r == TEST_RIGHT) {
1750 if (direction == DIRECTION_DOWN)
1751 goto found;
1752 else
1753 return 0;
a536e261 1754 }
cec736d2 1755
de190aef
LP
1756 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1757
cbdca852
LP
1758 if (r == 0 && step_back)
1759 goto found;
1760
ecf68b1d 1761 if (r > 0 && idx)
de190aef
LP
1762 (*idx) ++;
1763
1764 return r;
cbdca852
LP
1765
1766found:
1767 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1768 if (r < 0)
1769 return r;
1770
1771 if (ret)
1772 *ret = o;
1773
1774 if (offset)
1775 *offset = extra;
1776
1777 if (idx)
1778 *idx = 0;
1779
1780 return 1;
1781}
1782
44a6b1b6 1783_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1784 assert(f);
1785 assert(p > 0);
1786
1787 if (p == needle)
1788 return TEST_FOUND;
1789 else if (p < needle)
1790 return TEST_LEFT;
1791 else
1792 return TEST_RIGHT;
1793}
1794
1795int journal_file_move_to_entry_by_offset(
1796 JournalFile *f,
1797 uint64_t p,
1798 direction_t direction,
1799 Object **ret,
1800 uint64_t *offset) {
1801
1802 return generic_array_bisect(f,
1803 le64toh(f->header->entry_array_offset),
1804 le64toh(f->header->n_entries),
1805 p,
1806 test_object_offset,
1807 direction,
1808 ret, offset, NULL);
de190aef
LP
1809}
1810
cbdca852 1811
de190aef
LP
1812static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1813 Object *o;
1814 int r;
1815
1816 assert(f);
1817 assert(p > 0);
1818
1819 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1820 if (r < 0)
1821 return r;
1822
de190aef
LP
1823 if (le64toh(o->entry.seqnum) == needle)
1824 return TEST_FOUND;
1825 else if (le64toh(o->entry.seqnum) < needle)
1826 return TEST_LEFT;
1827 else
1828 return TEST_RIGHT;
1829}
cec736d2 1830
de190aef
LP
1831int journal_file_move_to_entry_by_seqnum(
1832 JournalFile *f,
1833 uint64_t seqnum,
1834 direction_t direction,
1835 Object **ret,
1836 uint64_t *offset) {
1837
1838 return generic_array_bisect(f,
1839 le64toh(f->header->entry_array_offset),
1840 le64toh(f->header->n_entries),
1841 seqnum,
1842 test_object_seqnum,
1843 direction,
1844 ret, offset, NULL);
1845}
cec736d2 1846
de190aef
LP
1847static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1848 Object *o;
1849 int r;
1850
1851 assert(f);
1852 assert(p > 0);
1853
1854 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1855 if (r < 0)
1856 return r;
1857
1858 if (le64toh(o->entry.realtime) == needle)
1859 return TEST_FOUND;
1860 else if (le64toh(o->entry.realtime) < needle)
1861 return TEST_LEFT;
1862 else
1863 return TEST_RIGHT;
cec736d2
LP
1864}
1865
de190aef
LP
1866int journal_file_move_to_entry_by_realtime(
1867 JournalFile *f,
1868 uint64_t realtime,
1869 direction_t direction,
1870 Object **ret,
1871 uint64_t *offset) {
1872
1873 return generic_array_bisect(f,
1874 le64toh(f->header->entry_array_offset),
1875 le64toh(f->header->n_entries),
1876 realtime,
1877 test_object_realtime,
1878 direction,
1879 ret, offset, NULL);
1880}
1881
1882static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1883 Object *o;
1884 int r;
1885
1886 assert(f);
1887 assert(p > 0);
1888
1889 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1890 if (r < 0)
1891 return r;
1892
1893 if (le64toh(o->entry.monotonic) == needle)
1894 return TEST_FOUND;
1895 else if (le64toh(o->entry.monotonic) < needle)
1896 return TEST_LEFT;
1897 else
1898 return TEST_RIGHT;
1899}
1900
47838ab3
ZJS
1901static inline int find_data_object_by_boot_id(
1902 JournalFile *f,
1903 sd_id128_t boot_id,
1904 Object **o,
1905 uint64_t *b) {
1906 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1907
1908 sd_id128_to_string(boot_id, t + 9);
1909 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1910}
1911
de190aef
LP
1912int journal_file_move_to_entry_by_monotonic(
1913 JournalFile *f,
1914 sd_id128_t boot_id,
1915 uint64_t monotonic,
1916 direction_t direction,
1917 Object **ret,
1918 uint64_t *offset) {
1919
de190aef
LP
1920 Object *o;
1921 int r;
1922
cbdca852 1923 assert(f);
de190aef 1924
47838ab3 1925 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
1926 if (r < 0)
1927 return r;
cbdca852 1928 if (r == 0)
de190aef
LP
1929 return -ENOENT;
1930
1931 return generic_array_bisect_plus_one(f,
1932 le64toh(o->data.entry_offset),
1933 le64toh(o->data.entry_array_offset),
1934 le64toh(o->data.n_entries),
1935 monotonic,
1936 test_object_monotonic,
1937 direction,
1938 ret, offset, NULL);
1939}
1940
de190aef
LP
1941int journal_file_next_entry(
1942 JournalFile *f,
1943 Object *o, uint64_t p,
1944 direction_t direction,
1945 Object **ret, uint64_t *offset) {
1946
fb099c8d 1947 uint64_t i, n, ofs;
cec736d2
LP
1948 int r;
1949
1950 assert(f);
de190aef
LP
1951 assert(p > 0 || !o);
1952
1953 n = le64toh(f->header->n_entries);
1954 if (n <= 0)
1955 return 0;
cec736d2
LP
1956
1957 if (!o)
de190aef 1958 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 1959 else {
de190aef 1960 if (o->object.type != OBJECT_ENTRY)
cec736d2
LP
1961 return -EINVAL;
1962
de190aef
LP
1963 r = generic_array_bisect(f,
1964 le64toh(f->header->entry_array_offset),
1965 le64toh(f->header->n_entries),
1966 p,
1967 test_object_offset,
1968 DIRECTION_DOWN,
1969 NULL, NULL,
1970 &i);
1971 if (r <= 0)
1972 return r;
1973
1974 if (direction == DIRECTION_DOWN) {
1975 if (i >= n - 1)
1976 return 0;
1977
1978 i++;
1979 } else {
1980 if (i <= 0)
1981 return 0;
1982
1983 i--;
1984 }
cec736d2
LP
1985 }
1986
de190aef 1987 /* And jump to it */
fb099c8d
ZJS
1988 r = generic_array_get(f,
1989 le64toh(f->header->entry_array_offset),
1990 i,
1991 ret, &ofs);
1992 if (r <= 0)
1993 return r;
1994
1995 if (p > 0 &&
1996 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
1997 log_debug("%s: entry array corrupted at entry %"PRIu64,
1998 f->path, i);
1999 return -EBADMSG;
2000 }
2001
2002 if (offset)
2003 *offset = ofs;
2004
2005 return 1;
de190aef 2006}
cec736d2 2007
de190aef
LP
2008int journal_file_skip_entry(
2009 JournalFile *f,
2010 Object *o, uint64_t p,
2011 int64_t skip,
2012 Object **ret, uint64_t *offset) {
2013
2014 uint64_t i, n;
2015 int r;
2016
2017 assert(f);
2018 assert(o);
2019 assert(p > 0);
2020
2021 if (o->object.type != OBJECT_ENTRY)
2022 return -EINVAL;
2023
2024 r = generic_array_bisect(f,
2025 le64toh(f->header->entry_array_offset),
2026 le64toh(f->header->n_entries),
2027 p,
2028 test_object_offset,
2029 DIRECTION_DOWN,
2030 NULL, NULL,
2031 &i);
2032 if (r <= 0)
cec736d2
LP
2033 return r;
2034
de190aef
LP
2035 /* Calculate new index */
2036 if (skip < 0) {
2037 if ((uint64_t) -skip >= i)
2038 i = 0;
2039 else
2040 i = i - (uint64_t) -skip;
2041 } else
2042 i += (uint64_t) skip;
cec736d2 2043
de190aef
LP
2044 n = le64toh(f->header->n_entries);
2045 if (n <= 0)
2046 return -EBADMSG;
cec736d2 2047
de190aef
LP
2048 if (i >= n)
2049 i = n-1;
2050
2051 return generic_array_get(f,
2052 le64toh(f->header->entry_array_offset),
2053 i,
2054 ret, offset);
cec736d2
LP
2055}
2056
de190aef
LP
2057int journal_file_next_entry_for_data(
2058 JournalFile *f,
2059 Object *o, uint64_t p,
2060 uint64_t data_offset,
2061 direction_t direction,
2062 Object **ret, uint64_t *offset) {
2063
2064 uint64_t n, i;
cec736d2 2065 int r;
de190aef 2066 Object *d;
cec736d2
LP
2067
2068 assert(f);
de190aef 2069 assert(p > 0 || !o);
cec736d2 2070
de190aef 2071 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2072 if (r < 0)
de190aef 2073 return r;
cec736d2 2074
de190aef
LP
2075 n = le64toh(d->data.n_entries);
2076 if (n <= 0)
2077 return n;
cec736d2 2078
de190aef
LP
2079 if (!o)
2080 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2081 else {
2082 if (o->object.type != OBJECT_ENTRY)
2083 return -EINVAL;
cec736d2 2084
de190aef
LP
2085 r = generic_array_bisect_plus_one(f,
2086 le64toh(d->data.entry_offset),
2087 le64toh(d->data.entry_array_offset),
2088 le64toh(d->data.n_entries),
2089 p,
2090 test_object_offset,
2091 DIRECTION_DOWN,
2092 NULL, NULL,
2093 &i);
2094
2095 if (r <= 0)
cec736d2
LP
2096 return r;
2097
de190aef
LP
2098 if (direction == DIRECTION_DOWN) {
2099 if (i >= n - 1)
2100 return 0;
cec736d2 2101
de190aef
LP
2102 i++;
2103 } else {
2104 if (i <= 0)
2105 return 0;
cec736d2 2106
de190aef
LP
2107 i--;
2108 }
cec736d2 2109
de190aef 2110 }
cec736d2 2111
de190aef
LP
2112 return generic_array_get_plus_one(f,
2113 le64toh(d->data.entry_offset),
2114 le64toh(d->data.entry_array_offset),
2115 i,
2116 ret, offset);
2117}
cec736d2 2118
cbdca852
LP
2119int journal_file_move_to_entry_by_offset_for_data(
2120 JournalFile *f,
2121 uint64_t data_offset,
2122 uint64_t p,
2123 direction_t direction,
2124 Object **ret, uint64_t *offset) {
2125
2126 int r;
2127 Object *d;
2128
2129 assert(f);
2130
2131 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2132 if (r < 0)
2133 return r;
2134
2135 return generic_array_bisect_plus_one(f,
2136 le64toh(d->data.entry_offset),
2137 le64toh(d->data.entry_array_offset),
2138 le64toh(d->data.n_entries),
2139 p,
2140 test_object_offset,
2141 direction,
2142 ret, offset, NULL);
2143}
2144
2145int journal_file_move_to_entry_by_monotonic_for_data(
2146 JournalFile *f,
2147 uint64_t data_offset,
2148 sd_id128_t boot_id,
2149 uint64_t monotonic,
2150 direction_t direction,
2151 Object **ret, uint64_t *offset) {
2152
cbdca852
LP
2153 Object *o, *d;
2154 int r;
2155 uint64_t b, z;
2156
2157 assert(f);
2158
2159 /* First, seek by time */
47838ab3 2160 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2161 if (r < 0)
2162 return r;
2163 if (r == 0)
2164 return -ENOENT;
2165
2166 r = generic_array_bisect_plus_one(f,
2167 le64toh(o->data.entry_offset),
2168 le64toh(o->data.entry_array_offset),
2169 le64toh(o->data.n_entries),
2170 monotonic,
2171 test_object_monotonic,
2172 direction,
2173 NULL, &z, NULL);
2174 if (r <= 0)
2175 return r;
2176
2177 /* And now, continue seeking until we find an entry that
2178 * exists in both bisection arrays */
2179
2180 for (;;) {
2181 Object *qo;
2182 uint64_t p, q;
2183
2184 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2185 if (r < 0)
2186 return r;
2187
2188 r = generic_array_bisect_plus_one(f,
2189 le64toh(d->data.entry_offset),
2190 le64toh(d->data.entry_array_offset),
2191 le64toh(d->data.n_entries),
2192 z,
2193 test_object_offset,
2194 direction,
2195 NULL, &p, NULL);
2196 if (r <= 0)
2197 return r;
2198
2199 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2200 if (r < 0)
2201 return r;
2202
2203 r = generic_array_bisect_plus_one(f,
2204 le64toh(o->data.entry_offset),
2205 le64toh(o->data.entry_array_offset),
2206 le64toh(o->data.n_entries),
2207 p,
2208 test_object_offset,
2209 direction,
2210 &qo, &q, NULL);
2211
2212 if (r <= 0)
2213 return r;
2214
2215 if (p == q) {
2216 if (ret)
2217 *ret = qo;
2218 if (offset)
2219 *offset = q;
2220
2221 return 1;
2222 }
2223
2224 z = q;
2225 }
cbdca852
LP
2226}
2227
de190aef
LP
2228int journal_file_move_to_entry_by_seqnum_for_data(
2229 JournalFile *f,
2230 uint64_t data_offset,
2231 uint64_t seqnum,
2232 direction_t direction,
2233 Object **ret, uint64_t *offset) {
cec736d2 2234
de190aef
LP
2235 Object *d;
2236 int r;
cec736d2 2237
91a31dde
LP
2238 assert(f);
2239
de190aef 2240 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2241 if (r < 0)
de190aef 2242 return r;
cec736d2 2243
de190aef
LP
2244 return generic_array_bisect_plus_one(f,
2245 le64toh(d->data.entry_offset),
2246 le64toh(d->data.entry_array_offset),
2247 le64toh(d->data.n_entries),
2248 seqnum,
2249 test_object_seqnum,
2250 direction,
2251 ret, offset, NULL);
2252}
cec736d2 2253
de190aef
LP
2254int journal_file_move_to_entry_by_realtime_for_data(
2255 JournalFile *f,
2256 uint64_t data_offset,
2257 uint64_t realtime,
2258 direction_t direction,
2259 Object **ret, uint64_t *offset) {
2260
2261 Object *d;
2262 int r;
2263
91a31dde
LP
2264 assert(f);
2265
de190aef 2266 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2267 if (r < 0)
de190aef
LP
2268 return r;
2269
2270 return generic_array_bisect_plus_one(f,
2271 le64toh(d->data.entry_offset),
2272 le64toh(d->data.entry_array_offset),
2273 le64toh(d->data.n_entries),
2274 realtime,
2275 test_object_realtime,
2276 direction,
2277 ret, offset, NULL);
cec736d2
LP
2278}
2279
0284adc6 2280void journal_file_dump(JournalFile *f) {
7560fffc 2281 Object *o;
7560fffc 2282 int r;
0284adc6 2283 uint64_t p;
7560fffc
LP
2284
2285 assert(f);
2286
0284adc6 2287 journal_file_print_header(f);
7560fffc 2288
0284adc6
LP
2289 p = le64toh(f->header->header_size);
2290 while (p != 0) {
2291 r = journal_file_move_to_object(f, -1, p, &o);
2292 if (r < 0)
2293 goto fail;
7560fffc 2294
0284adc6 2295 switch (o->object.type) {
d98cc1f2 2296
0284adc6
LP
2297 case OBJECT_UNUSED:
2298 printf("Type: OBJECT_UNUSED\n");
2299 break;
d98cc1f2 2300
0284adc6
LP
2301 case OBJECT_DATA:
2302 printf("Type: OBJECT_DATA\n");
2303 break;
7560fffc 2304
3c1668da
LP
2305 case OBJECT_FIELD:
2306 printf("Type: OBJECT_FIELD\n");
2307 break;
2308
0284adc6 2309 case OBJECT_ENTRY:
507f22bd
ZJS
2310 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2311 le64toh(o->entry.seqnum),
2312 le64toh(o->entry.monotonic),
2313 le64toh(o->entry.realtime));
0284adc6 2314 break;
7560fffc 2315
0284adc6
LP
2316 case OBJECT_FIELD_HASH_TABLE:
2317 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2318 break;
7560fffc 2319
0284adc6
LP
2320 case OBJECT_DATA_HASH_TABLE:
2321 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2322 break;
7560fffc 2323
0284adc6
LP
2324 case OBJECT_ENTRY_ARRAY:
2325 printf("Type: OBJECT_ENTRY_ARRAY\n");
2326 break;
7560fffc 2327
0284adc6 2328 case OBJECT_TAG:
507f22bd
ZJS
2329 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2330 le64toh(o->tag.seqnum),
2331 le64toh(o->tag.epoch));
0284adc6 2332 break;
3c1668da
LP
2333
2334 default:
2335 printf("Type: unknown (%u)\n", o->object.type);
2336 break;
0284adc6 2337 }
7560fffc 2338
d89c8fdf
ZJS
2339 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2340 printf("Flags: %s\n",
2341 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 2342
0284adc6
LP
2343 if (p == le64toh(f->header->tail_object_offset))
2344 p = 0;
2345 else
2346 p = p + ALIGN64(le64toh(o->object.size));
2347 }
7560fffc 2348
0284adc6
LP
2349 return;
2350fail:
2351 log_error("File corrupt");
7560fffc
LP
2352}
2353
718fe4b1
ZJS
2354static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2355 const char *x;
2356
2357 x = format_timestamp(buf, l, t);
2358 if (x)
2359 return x;
2360 return " --- ";
2361}
2362
0284adc6 2363void journal_file_print_header(JournalFile *f) {
2765b7bb 2364 char a[33], b[33], c[33], d[33];
ed375beb 2365 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2366 struct stat st;
2367 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2368
2369 assert(f);
7560fffc 2370
0284adc6
LP
2371 printf("File Path: %s\n"
2372 "File ID: %s\n"
2373 "Machine ID: %s\n"
2374 "Boot ID: %s\n"
2375 "Sequential Number ID: %s\n"
2376 "State: %s\n"
2377 "Compatible Flags:%s%s\n"
d89c8fdf 2378 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
2379 "Header size: %"PRIu64"\n"
2380 "Arena size: %"PRIu64"\n"
2381 "Data Hash Table Size: %"PRIu64"\n"
2382 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2383 "Rotate Suggested: %s\n"
507f22bd
ZJS
2384 "Head Sequential Number: %"PRIu64"\n"
2385 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2386 "Head Realtime Timestamp: %s\n"
3223f44f 2387 "Tail Realtime Timestamp: %s\n"
ed375beb 2388 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2389 "Objects: %"PRIu64"\n"
2390 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2391 f->path,
2392 sd_id128_to_string(f->header->file_id, a),
2393 sd_id128_to_string(f->header->machine_id, b),
2394 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2395 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2396 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2397 f->header->state == STATE_ONLINE ? "ONLINE" :
2398 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 2399 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
2400 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2401 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2402 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2403 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
2404 le64toh(f->header->header_size),
2405 le64toh(f->header->arena_size),
2406 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2407 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2408 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2409 le64toh(f->header->head_entry_seqnum),
2410 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2411 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2412 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2413 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2414 le64toh(f->header->n_objects),
2415 le64toh(f->header->n_entries));
7560fffc 2416
0284adc6 2417 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2418 printf("Data Objects: %"PRIu64"\n"
0284adc6 2419 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2420 le64toh(f->header->n_data),
0284adc6 2421 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2422
0284adc6 2423 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2424 printf("Field Objects: %"PRIu64"\n"
0284adc6 2425 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2426 le64toh(f->header->n_fields),
0284adc6 2427 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2428
2429 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2430 printf("Tag Objects: %"PRIu64"\n",
2431 le64toh(f->header->n_tags));
3223f44f 2432 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2433 printf("Entry Array Objects: %"PRIu64"\n",
2434 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2435
2436 if (fstat(f->fd, &st) >= 0)
2437 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
7560fffc
LP
2438}
2439
0284adc6
LP
2440int journal_file_open(
2441 const char *fname,
2442 int flags,
2443 mode_t mode,
2444 bool compress,
baed47c3 2445 bool seal,
0284adc6
LP
2446 JournalMetrics *metrics,
2447 MMapCache *mmap_cache,
2448 JournalFile *template,
2449 JournalFile **ret) {
7560fffc 2450
0284adc6
LP
2451 JournalFile *f;
2452 int r;
2453 bool newly_created = false;
7560fffc 2454
0284adc6 2455 assert(fname);
0559d3a5 2456 assert(ret);
7560fffc 2457
0284adc6
LP
2458 if ((flags & O_ACCMODE) != O_RDONLY &&
2459 (flags & O_ACCMODE) != O_RDWR)
2460 return -EINVAL;
7560fffc 2461
a0108012
LP
2462 if (!endswith(fname, ".journal") &&
2463 !endswith(fname, ".journal~"))
0284adc6 2464 return -EINVAL;
7560fffc 2465
0284adc6
LP
2466 f = new0(JournalFile, 1);
2467 if (!f)
2468 return -ENOMEM;
7560fffc 2469
0284adc6
LP
2470 f->fd = -1;
2471 f->mode = mode;
7560fffc 2472
0284adc6
LP
2473 f->flags = flags;
2474 f->prot = prot_from_flags(flags);
2475 f->writable = (flags & O_ACCMODE) != O_RDONLY;
92261977 2476#if defined(HAVE_LZ4)
d89c8fdf
ZJS
2477 f->compress_lz4 = compress;
2478#elif defined(HAVE_XZ)
2479 f->compress_xz = compress;
48b61739 2480#endif
49a32d43 2481#ifdef HAVE_GCRYPT
baed47c3 2482 f->seal = seal;
49a32d43 2483#endif
7560fffc 2484
0284adc6
LP
2485 if (mmap_cache)
2486 f->mmap = mmap_cache_ref(mmap_cache);
2487 else {
84168d80 2488 f->mmap = mmap_cache_new();
0284adc6
LP
2489 if (!f->mmap) {
2490 r = -ENOMEM;
2491 goto fail;
2492 }
2493 }
7560fffc 2494
0284adc6
LP
2495 f->path = strdup(fname);
2496 if (!f->path) {
2497 r = -ENOMEM;
2498 goto fail;
2499 }
7560fffc 2500
d5099efc 2501 f->chain_cache = hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
2502 if (!f->chain_cache) {
2503 r = -ENOMEM;
2504 goto fail;
2505 }
2506
0284adc6
LP
2507 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2508 if (f->fd < 0) {
2509 r = -errno;
2510 goto fail;
7560fffc 2511 }
7560fffc 2512
0284adc6
LP
2513 if (fstat(f->fd, &f->last_stat) < 0) {
2514 r = -errno;
2515 goto fail;
2516 }
7560fffc 2517
0284adc6 2518 if (f->last_stat.st_size == 0 && f->writable) {
fb0951b0
LP
2519 uint64_t crtime;
2520
2521 /* Let's attach the creation time to the journal file,
2522 * so that the vacuuming code knows the age of this
2523 * file even if the file might end up corrupted one
2524 * day... Ideally we'd just use the creation time many
2525 * file systems maintain for each file, but there is
2526 * currently no usable API to query this, hence let's
2527 * emulate this via extended attributes. If extended
2528 * attributes are not supported we'll just skip this,
2529 * and rely solely on mtime/atime/ctime of the file.*/
2530
2531 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2532 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
7560fffc 2533
feb12d3e 2534#ifdef HAVE_GCRYPT
0284adc6 2535 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2536 * just don't do sealing */
49a32d43
LP
2537 if (f->seal) {
2538 r = journal_file_fss_load(f);
2539 if (r < 0)
2540 f->seal = false;
2541 }
feb12d3e 2542#endif
7560fffc 2543
0284adc6
LP
2544 r = journal_file_init_header(f, template);
2545 if (r < 0)
2546 goto fail;
7560fffc 2547
0284adc6
LP
2548 if (fstat(f->fd, &f->last_stat) < 0) {
2549 r = -errno;
2550 goto fail;
2551 }
fb0951b0
LP
2552
2553 newly_created = true;
0284adc6 2554 }
7560fffc 2555
0284adc6
LP
2556 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2557 r = -EIO;
2558 goto fail;
2559 }
7560fffc 2560
0284adc6
LP
2561 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2562 if (f->header == MAP_FAILED) {
2563 f->header = NULL;
2564 r = -errno;
2565 goto fail;
2566 }
7560fffc 2567
0284adc6
LP
2568 if (!newly_created) {
2569 r = journal_file_verify_header(f);
2570 if (r < 0)
2571 goto fail;
2572 }
7560fffc 2573
feb12d3e 2574#ifdef HAVE_GCRYPT
0284adc6 2575 if (!newly_created && f->writable) {
baed47c3 2576 r = journal_file_fss_load(f);
0284adc6
LP
2577 if (r < 0)
2578 goto fail;
2579 }
feb12d3e 2580#endif
cec736d2
LP
2581
2582 if (f->writable) {
4a92baf3
LP
2583 if (metrics) {
2584 journal_default_metrics(metrics, f->fd);
2585 f->metrics = *metrics;
2586 } else if (template)
2587 f->metrics = template->metrics;
2588
cec736d2
LP
2589 r = journal_file_refresh_header(f);
2590 if (r < 0)
2591 goto fail;
2592 }
2593
feb12d3e 2594#ifdef HAVE_GCRYPT
baed47c3 2595 r = journal_file_hmac_setup(f);
14d10188
LP
2596 if (r < 0)
2597 goto fail;
feb12d3e 2598#endif
14d10188 2599
cec736d2 2600 if (newly_created) {
de190aef 2601 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2602 if (r < 0)
2603 goto fail;
2604
de190aef 2605 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2606 if (r < 0)
2607 goto fail;
7560fffc 2608
feb12d3e 2609#ifdef HAVE_GCRYPT
7560fffc
LP
2610 r = journal_file_append_first_tag(f);
2611 if (r < 0)
2612 goto fail;
feb12d3e 2613#endif
cec736d2
LP
2614 }
2615
de190aef 2616 r = journal_file_map_field_hash_table(f);
cec736d2
LP
2617 if (r < 0)
2618 goto fail;
2619
de190aef 2620 r = journal_file_map_data_hash_table(f);
cec736d2
LP
2621 if (r < 0)
2622 goto fail;
2623
0559d3a5 2624 *ret = f;
cec736d2
LP
2625 return 0;
2626
2627fail:
2628 journal_file_close(f);
2629
2630 return r;
2631}
0ac38b70 2632
baed47c3 2633int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2634 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2635 size_t l;
2636 JournalFile *old_file, *new_file = NULL;
2637 int r;
2638
2639 assert(f);
2640 assert(*f);
2641
2642 old_file = *f;
2643
2644 if (!old_file->writable)
2645 return -EINVAL;
2646
2647 if (!endswith(old_file->path, ".journal"))
2648 return -EINVAL;
2649
2650 l = strlen(old_file->path);
57535f47
ZJS
2651 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2652 (int) l - 8, old_file->path,
2653 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2654 le64toh((*f)->header->head_entry_seqnum),
2655 le64toh((*f)->header->head_entry_realtime));
2656 if (r < 0)
0ac38b70
LP
2657 return -ENOMEM;
2658
0ac38b70 2659 r = rename(old_file->path, p);
0ac38b70
LP
2660 if (r < 0)
2661 return -errno;
2662
ccdbaf91 2663 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2664
baed47c3 2665 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2666 journal_file_close(old_file);
2667
2668 *f = new_file;
2669 return r;
2670}
2671
9447a7f1
LP
2672int journal_file_open_reliably(
2673 const char *fname,
2674 int flags,
2675 mode_t mode,
7560fffc 2676 bool compress,
baed47c3 2677 bool seal,
4a92baf3 2678 JournalMetrics *metrics,
27370278 2679 MMapCache *mmap_cache,
9447a7f1
LP
2680 JournalFile *template,
2681 JournalFile **ret) {
2682
2683 int r;
2684 size_t l;
ed375beb 2685 _cleanup_free_ char *p = NULL;
9447a7f1 2686
baed47c3 2687 r = journal_file_open(fname, flags, mode, compress, seal,
27370278 2688 metrics, mmap_cache, template, ret);
0071d9f1
LP
2689 if (r != -EBADMSG && /* corrupted */
2690 r != -ENODATA && /* truncated */
2691 r != -EHOSTDOWN && /* other machine */
a1a1898f
LP
2692 r != -EPROTONOSUPPORT && /* incompatible feature */
2693 r != -EBUSY && /* unclean shutdown */
2694 r != -ESHUTDOWN /* already archived */)
9447a7f1
LP
2695 return r;
2696
2697 if ((flags & O_ACCMODE) == O_RDONLY)
2698 return r;
2699
2700 if (!(flags & O_CREAT))
2701 return r;
2702
7560fffc
LP
2703 if (!endswith(fname, ".journal"))
2704 return r;
2705
5c70eab4
LP
2706 /* The file is corrupted. Rotate it away and try it again (but only once) */
2707
9447a7f1 2708 l = strlen(fname);
9bf3b535 2709 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
57535f47 2710 (int) l - 8, fname,
9447a7f1 2711 (unsigned long long) now(CLOCK_REALTIME),
9bf3b535 2712 random_u64()) < 0)
9447a7f1
LP
2713 return -ENOMEM;
2714
2715 r = rename(fname, p);
9447a7f1
LP
2716 if (r < 0)
2717 return -errno;
2718
a1a1898f 2719 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2720
baed47c3 2721 return journal_file_open(fname, flags, mode, compress, seal,
27370278 2722 metrics, mmap_cache, template, ret);
9447a7f1
LP
2723}
2724
cf244689
LP
2725int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2726 uint64_t i, n;
2727 uint64_t q, xor_hash = 0;
2728 int r;
2729 EntryItem *items;
2730 dual_timestamp ts;
2731
2732 assert(from);
2733 assert(to);
2734 assert(o);
2735 assert(p);
2736
2737 if (!to->writable)
2738 return -EPERM;
2739
2740 ts.monotonic = le64toh(o->entry.monotonic);
2741 ts.realtime = le64toh(o->entry.realtime);
2742
cf244689 2743 n = journal_file_entry_n_items(o);
4faa7004
TA
2744 /* alloca() can't take 0, hence let's allocate at least one */
2745 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
2746
2747 for (i = 0; i < n; i++) {
4fd052ae
FC
2748 uint64_t l, h;
2749 le64_t le_hash;
cf244689
LP
2750 size_t t;
2751 void *data;
2752 Object *u;
2753
2754 q = le64toh(o->entry.items[i].object_offset);
2755 le_hash = o->entry.items[i].hash;
2756
2757 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2758 if (r < 0)
2759 return r;
2760
2761 if (le_hash != o->data.hash)
2762 return -EBADMSG;
2763
2764 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2765 t = (size_t) l;
2766
2767 /* We hit the limit on 32bit machines */
2768 if ((uint64_t) t != l)
2769 return -E2BIG;
2770
d89c8fdf 2771 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 2772#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51 2773 size_t rsize;
cf244689 2774
d89c8fdf
ZJS
2775 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2776 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2777 if (r < 0)
2778 return r;
cf244689
LP
2779
2780 data = from->compress_buffer;
2781 l = rsize;
3b1a55e1
ZJS
2782#else
2783 return -EPROTONOSUPPORT;
2784#endif
cf244689
LP
2785 } else
2786 data = o->data.payload;
2787
2788 r = journal_file_append_data(to, data, l, &u, &h);
2789 if (r < 0)
2790 return r;
2791
2792 xor_hash ^= le64toh(u->data.hash);
2793 items[i].object_offset = htole64(h);
2794 items[i].hash = u->data.hash;
2795
2796 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2797 if (r < 0)
2798 return r;
2799 }
2800
2801 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2802}
babfc091
LP
2803
2804void journal_default_metrics(JournalMetrics *m, int fd) {
2805 uint64_t fs_size = 0;
2806 struct statvfs ss;
a7bc2c2a 2807 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
babfc091
LP
2808
2809 assert(m);
2810 assert(fd >= 0);
2811
2812 if (fstatvfs(fd, &ss) >= 0)
2813 fs_size = ss.f_frsize * ss.f_blocks;
2814
2815 if (m->max_use == (uint64_t) -1) {
2816
2817 if (fs_size > 0) {
2818 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2819
2820 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2821 m->max_use = DEFAULT_MAX_USE_UPPER;
2822
2823 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2824 m->max_use = DEFAULT_MAX_USE_LOWER;
2825 } else
2826 m->max_use = DEFAULT_MAX_USE_LOWER;
2827 } else {
2828 m->max_use = PAGE_ALIGN(m->max_use);
2829
2830 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2831 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2832 }
2833
2834 if (m->max_size == (uint64_t) -1) {
2835 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2836
2837 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2838 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2839 } else
2840 m->max_size = PAGE_ALIGN(m->max_size);
2841
2842 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2843 m->max_size = JOURNAL_FILE_SIZE_MIN;
2844
2845 if (m->max_size*2 > m->max_use)
2846 m->max_use = m->max_size*2;
2847
2848 if (m->min_size == (uint64_t) -1)
2849 m->min_size = JOURNAL_FILE_SIZE_MIN;
2850 else {
2851 m->min_size = PAGE_ALIGN(m->min_size);
2852
2853 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2854 m->min_size = JOURNAL_FILE_SIZE_MIN;
2855
2856 if (m->min_size > m->max_size)
2857 m->max_size = m->min_size;
2858 }
2859
2860 if (m->keep_free == (uint64_t) -1) {
2861
2862 if (fs_size > 0) {
8621b110 2863 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
2864
2865 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2866 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2867
2868 } else
2869 m->keep_free = DEFAULT_KEEP_FREE;
2870 }
2871
2b43f939
LP
2872 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2873 format_bytes(a, sizeof(a), m->max_use),
2874 format_bytes(b, sizeof(b), m->max_size),
2875 format_bytes(c, sizeof(c), m->min_size),
2876 format_bytes(d, sizeof(d), m->keep_free));
babfc091 2877}
08984293
LP
2878
2879int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
2880 assert(f);
2881 assert(from || to);
2882
2883 if (from) {
162566a4
LP
2884 if (f->header->head_entry_realtime == 0)
2885 return -ENOENT;
08984293 2886
162566a4 2887 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
2888 }
2889
2890 if (to) {
162566a4
LP
2891 if (f->header->tail_entry_realtime == 0)
2892 return -ENOENT;
08984293 2893
162566a4 2894 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
2895 }
2896
2897 return 1;
2898}
2899
2900int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
2901 Object *o;
2902 uint64_t p;
2903 int r;
2904
2905 assert(f);
2906 assert(from || to);
2907
47838ab3 2908 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
2909 if (r <= 0)
2910 return r;
2911
2912 if (le64toh(o->data.n_entries) <= 0)
2913 return 0;
2914
2915 if (from) {
2916 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2917 if (r < 0)
2918 return r;
2919
2920 *from = le64toh(o->entry.monotonic);
2921 }
2922
2923 if (to) {
2924 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2925 if (r < 0)
2926 return r;
2927
2928 r = generic_array_get_plus_one(f,
2929 le64toh(o->data.entry_offset),
2930 le64toh(o->data.entry_array_offset),
2931 le64toh(o->data.n_entries)-1,
2932 &o, NULL);
2933 if (r <= 0)
2934 return r;
2935
2936 *to = le64toh(o->entry.monotonic);
2937 }
2938
2939 return 1;
2940}
dca6219e 2941
fb0951b0 2942bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
2943 assert(f);
2944
2945 /* If we gained new header fields we gained new features,
2946 * hence suggest a rotation */
361f9cbc
LP
2947 if (le64toh(f->header->header_size) < sizeof(Header)) {
2948 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 2949 return true;
361f9cbc 2950 }
dca6219e
LP
2951
2952 /* Let's check if the hash tables grew over a certain fill
2953 * level (75%, borrowing this value from Java's hash table
2954 * implementation), and if so suggest a rotation. To calculate
2955 * the fill level we need the n_data field, which only exists
2956 * in newer versions. */
2957
2958 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 2959 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 2960 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
2961 f->path,
2962 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
2963 le64toh(f->header->n_data),
2964 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2965 (unsigned long long) f->last_stat.st_size,
2966 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 2967 return true;
361f9cbc 2968 }
dca6219e
LP
2969
2970 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 2971 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 2972 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
2973 f->path,
2974 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
2975 le64toh(f->header->n_fields),
2976 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 2977 return true;
361f9cbc 2978 }
dca6219e 2979
0598fd4a
LP
2980 /* Are the data objects properly indexed by field objects? */
2981 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2982 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2983 le64toh(f->header->n_data) > 0 &&
2984 le64toh(f->header->n_fields) == 0)
2985 return true;
2986
fb0951b0
LP
2987 if (max_file_usec > 0) {
2988 usec_t t, h;
2989
2990 h = le64toh(f->header->head_entry_realtime);
2991 t = now(CLOCK_REALTIME);
2992
2993 if (h > 0 && t > h + max_file_usec)
2994 return true;
2995 }
2996
dca6219e
LP
2997 return false;
2998}