]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
treewide: use log_*_errno whenever %m is in the format string
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <sys/mman.h>
23#include <errno.h>
24#include <sys/uio.h>
25#include <unistd.h>
26#include <sys/statvfs.h>
27#include <fcntl.h>
28#include <stddef.h>
d2edfae0 29#include <sys/xattr.h>
fb0951b0 30
cec736d2
LP
31#include "journal-def.h"
32#include "journal-file.h"
0284adc6 33#include "journal-authenticate.h"
cec736d2 34#include "lookup3.h"
807e17f0 35#include "compress.h"
7560fffc 36#include "fsprg.h"
cec736d2 37
4a92baf3
LP
/* Default sizes (in bytes) of the data and field hash tables */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Payloads shorter than this many bytes are never handed to the compressor */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)                /* 4 MiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)                /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)        /* 4 GiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)             /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)      /* 4 GiB */

/* This is the keep_free value when we can't determine the system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                         /* 1 MiB */

/* n_data was the first entry we added after the initial file format design */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)                   /* 8 MiB */
70
9588bc32 71static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
72 assert(f);
73
74 if (!f->writable)
75 return -EPERM;
76
77 if (!(f->fd >= 0 && f->header))
78 return -EINVAL;
79
80 switch(f->header->state) {
81 case STATE_ONLINE:
82 return 0;
83
84 case STATE_OFFLINE:
85 f->header->state = STATE_ONLINE;
86 fsync(f->fd);
87 return 0;
88
89 default:
90 return -EINVAL;
91 }
92}
93
94int journal_file_set_offline(JournalFile *f) {
95 assert(f);
96
97 if (!f->writable)
98 return -EPERM;
99
100 if (!(f->fd >= 0 && f->header))
101 return -EINVAL;
102
103 if (f->header->state != STATE_ONLINE)
104 return 0;
105
106 fsync(f->fd);
107
108 f->header->state = STATE_OFFLINE;
109
110 fsync(f->fd);
111
112 return 0;
113}
114
cec736d2 115void journal_file_close(JournalFile *f) {
de190aef 116 assert(f);
cec736d2 117
feb12d3e 118#ifdef HAVE_GCRYPT
b0af6f41 119 /* Write the final tag */
c586dbf1 120 if (f->seal && f->writable)
b0af6f41 121 journal_file_append_tag(f);
feb12d3e 122#endif
b0af6f41 123
7560fffc 124 /* Sync everything to disk, before we mark the file offline */
16e9f408
LP
125 if (f->mmap && f->fd >= 0)
126 mmap_cache_close_fd(f->mmap, f->fd);
7560fffc 127
26687bf8 128 journal_file_set_offline(f);
cec736d2 129
26687bf8 130 if (f->header)
d384c7a8 131 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
cec736d2 132
03e334a1 133 safe_close(f->fd);
cec736d2 134 free(f->path);
807e17f0 135
16e9f408
LP
136 if (f->mmap)
137 mmap_cache_unref(f->mmap);
138
4743015d 139 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 140
d89c8fdf 141#if defined(HAVE_XZ) || defined(HAVE_LZ4)
807e17f0
LP
142 free(f->compress_buffer);
143#endif
144
7560fffc 145#ifdef HAVE_GCRYPT
baed47c3
LP
146 if (f->fss_file)
147 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
b7c9ae91
LP
148 else if (f->fsprg_state)
149 free(f->fsprg_state);
150
151 free(f->fsprg_seed);
7560fffc
LP
152
153 if (f->hmac)
154 gcry_md_close(f->hmac);
155#endif
156
cec736d2
LP
157 free(f);
158}
159
0ac38b70 160static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 161 Header h = {};
cec736d2
LP
162 ssize_t k;
163 int r;
164
165 assert(f);
166
7560fffc 167 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 168 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 169
d89c8fdf
ZJS
170 h.incompatible_flags |= htole32(
171 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
172 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 173
d89c8fdf
ZJS
174 h.compatible_flags = htole32(
175 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 176
cec736d2
LP
177 r = sd_id128_randomize(&h.file_id);
178 if (r < 0)
179 return r;
180
0ac38b70
LP
181 if (template) {
182 h.seqnum_id = template->header->seqnum_id;
beec0085 183 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
184 } else
185 h.seqnum_id = h.file_id;
cec736d2
LP
186
187 k = pwrite(f->fd, &h, sizeof(h), 0);
188 if (k < 0)
189 return -errno;
190
191 if (k != sizeof(h))
192 return -EIO;
193
194 return 0;
195}
196
197static int journal_file_refresh_header(JournalFile *f) {
198 int r;
de190aef 199 sd_id128_t boot_id;
cec736d2
LP
200
201 assert(f);
202
203 r = sd_id128_get_machine(&f->header->machine_id);
204 if (r < 0)
205 return r;
206
de190aef 207 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
208 if (r < 0)
209 return r;
210
de190aef
LP
211 if (sd_id128_equal(boot_id, f->header->boot_id))
212 f->tail_entry_monotonic_valid = true;
213
214 f->header->boot_id = boot_id;
215
26687bf8 216 journal_file_set_online(f);
b788cc23 217
7560fffc 218 /* Sync the online state to disk */
a676e665 219 fsync(f->fd);
b788cc23 220
cec736d2
LP
221 return 0;
222}
223
224static int journal_file_verify_header(JournalFile *f) {
d89c8fdf
ZJS
225 uint32_t flags;
226
cec736d2
LP
227 assert(f);
228
7560fffc 229 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
230 return -EBADMSG;
231
7560fffc
LP
232 /* In both read and write mode we refuse to open files with
233 * incompatible flags we don't know */
d89c8fdf
ZJS
234 flags = le32toh(f->header->incompatible_flags);
235 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
236 if (flags & ~HEADER_INCOMPATIBLE_ANY)
237 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
238 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
239 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
240 if (flags)
241 log_debug("Journal file %s uses incompatible flags %"PRIx32
242 " disabled at compilation time.", f->path, flags);
cec736d2 243 return -EPROTONOSUPPORT;
d89c8fdf 244 }
cec736d2 245
7560fffc
LP
246 /* When open for writing we refuse to open files with
247 * compatible flags, too */
d89c8fdf
ZJS
248 flags = le32toh(f->header->compatible_flags);
249 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
250 if (flags & ~HEADER_COMPATIBLE_ANY)
251 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
252 f->path, flags & ~HEADER_COMPATIBLE_ANY);
253 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
254 if (flags)
255 log_debug("Journal file %s uses compatible flags %"PRIx32
256 " disabled at compilation time.", f->path, flags);
257 return -EPROTONOSUPPORT;
7560fffc
LP
258 }
259
db11ac1a
LP
260 if (f->header->state >= _STATE_MAX)
261 return -EBADMSG;
262
dca6219e
LP
263 /* The first addition was n_data, so check that we are at least this large */
264 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
265 return -EBADMSG;
266
8088cbd3 267 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
268 return -EBADMSG;
269
db11ac1a
LP
270 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
271 return -ENODATA;
272
273 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
274 return -ENODATA;
275
7762e02b
LP
276 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
277 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
278 !VALID64(le64toh(f->header->tail_object_offset)) ||
279 !VALID64(le64toh(f->header->entry_array_offset)))
280 return -ENODATA;
281
cec736d2 282 if (f->writable) {
ccdbaf91 283 uint8_t state;
cec736d2
LP
284 sd_id128_t machine_id;
285 int r;
286
287 r = sd_id128_get_machine(&machine_id);
288 if (r < 0)
289 return r;
290
291 if (!sd_id128_equal(machine_id, f->header->machine_id))
292 return -EHOSTDOWN;
293
de190aef 294 state = f->header->state;
cec736d2 295
71fa6f00
LP
296 if (state == STATE_ONLINE) {
297 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
298 return -EBUSY;
299 } else if (state == STATE_ARCHIVED)
cec736d2 300 return -ESHUTDOWN;
71fa6f00
LP
301 else if (state != STATE_OFFLINE) {
302 log_debug("Journal file %s has unknown state %u.", f->path, state);
303 return -EBUSY;
304 }
cec736d2
LP
305 }
306
d89c8fdf
ZJS
307 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
308 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 309
f1889c91 310 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 311
cec736d2
LP
312 return 0;
313}
314
315static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 316 uint64_t old_size, new_size;
fec2aa2f 317 int r;
cec736d2
LP
318
319 assert(f);
320
cec736d2 321 /* We assume that this file is not sparse, and we know that
38ac38b2 322 * for sure, since we always call posix_fallocate()
cec736d2
LP
323 * ourselves */
324
325 old_size =
23b0b2b2 326 le64toh(f->header->header_size) +
cec736d2
LP
327 le64toh(f->header->arena_size);
328
bc85bfee 329 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
330 if (new_size < le64toh(f->header->header_size))
331 new_size = le64toh(f->header->header_size);
bc85bfee
LP
332
333 if (new_size <= old_size)
cec736d2
LP
334 return 0;
335
a676e665 336 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 337 return -E2BIG;
cec736d2 338
a676e665 339 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
340 struct statvfs svfs;
341
342 if (fstatvfs(f->fd, &svfs) >= 0) {
343 uint64_t available;
344
345 available = svfs.f_bfree * svfs.f_bsize;
346
bc85bfee
LP
347 if (available >= f->metrics.keep_free)
348 available -= f->metrics.keep_free;
cec736d2
LP
349 else
350 available = 0;
351
352 if (new_size - old_size > available)
353 return -E2BIG;
354 }
355 }
356
eda4b58b
LP
357 /* Increase by larger blocks at once */
358 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
359 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
360 new_size = f->metrics.max_size;
361
bc85bfee
LP
362 /* Note that the glibc fallocate() fallback is very
363 inefficient, hence we try to minimize the allocation area
364 as we can. */
fec2aa2f
GV
365 r = posix_fallocate(f->fd, old_size, new_size - old_size);
366 if (r != 0)
367 return -r;
cec736d2 368
eda4b58b
LP
369 if (fstat(f->fd, &f->last_stat) < 0)
370 return -errno;
cec736d2 371
23b0b2b2 372 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2
LP
373
374 return 0;
375}
376
fcde2389 377static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
cec736d2 378 assert(f);
cec736d2
LP
379 assert(ret);
380
7762e02b
LP
381 if (size <= 0)
382 return -EINVAL;
383
2a59ea54 384 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
385 if (offset + size > (uint64_t) f->last_stat.st_size) {
386 /* Hmm, out of range? Let's refresh the fstat() data
387 * first, before we trust that check. */
388
389 if (fstat(f->fd, &f->last_stat) < 0 ||
390 offset + size > (uint64_t) f->last_stat.st_size)
391 return -EADDRNOTAVAIL;
392 }
393
06cc69d4 394 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret, NULL);
cec736d2
LP
395}
396
16e9f408
LP
397static uint64_t minimum_header_size(Object *o) {
398
b8e891e6 399 static const uint64_t table[] = {
16e9f408
LP
400 [OBJECT_DATA] = sizeof(DataObject),
401 [OBJECT_FIELD] = sizeof(FieldObject),
402 [OBJECT_ENTRY] = sizeof(EntryObject),
403 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
404 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
405 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
406 [OBJECT_TAG] = sizeof(TagObject),
407 };
408
409 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
410 return sizeof(ObjectHeader);
411
412 return table[o->object.type];
413}
414
de190aef 415int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
cec736d2
LP
416 int r;
417 void *t;
418 Object *o;
419 uint64_t s;
420
421 assert(f);
422 assert(ret);
423
db11ac1a
LP
424 /* Objects may only be located at multiple of 64 bit */
425 if (!VALID64(offset))
426 return -EFAULT;
427
ae97089d 428 r = journal_file_move_to(f, type_to_context(type), false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
429 if (r < 0)
430 return r;
431
432 o = (Object*) t;
433 s = le64toh(o->object.size);
434
435 if (s < sizeof(ObjectHeader))
436 return -EBADMSG;
437
16e9f408
LP
438 if (o->object.type <= OBJECT_UNUSED)
439 return -EBADMSG;
440
441 if (s < minimum_header_size(o))
442 return -EBADMSG;
443
3c1668da 444 if (type > 0 && o->object.type != type)
cec736d2
LP
445 return -EBADMSG;
446
447 if (s > sizeof(ObjectHeader)) {
fcde2389 448 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
cec736d2
LP
449 if (r < 0)
450 return r;
451
452 o = (Object*) t;
453 }
454
cec736d2
LP
455 *ret = o;
456 return 0;
457}
458
d98cc1f2 459static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
460 uint64_t r;
461
462 assert(f);
463
beec0085 464 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
465
466 if (seqnum) {
de190aef 467 /* If an external seqnum counter was passed, we update
c2373f84
LP
468 * both the local and the external one, and set it to
469 * the maximum of both */
470
471 if (*seqnum + 1 > r)
472 r = *seqnum + 1;
473
474 *seqnum = r;
475 }
476
beec0085 477 f->header->tail_entry_seqnum = htole64(r);
cec736d2 478
beec0085
LP
479 if (f->header->head_entry_seqnum == 0)
480 f->header->head_entry_seqnum = htole64(r);
de190aef 481
cec736d2
LP
482 return r;
483}
484
0284adc6 485int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
486 int r;
487 uint64_t p;
488 Object *tail, *o;
489 void *t;
490
491 assert(f);
16e9f408 492 assert(type > 0 && type < _OBJECT_TYPE_MAX);
cec736d2
LP
493 assert(size >= sizeof(ObjectHeader));
494 assert(offset);
495 assert(ret);
496
26687bf8
OS
497 r = journal_file_set_online(f);
498 if (r < 0)
499 return r;
500
cec736d2 501 p = le64toh(f->header->tail_object_offset);
cec736d2 502 if (p == 0)
23b0b2b2 503 p = le64toh(f->header->header_size);
cec736d2 504 else {
de190aef 505 r = journal_file_move_to_object(f, -1, p, &tail);
cec736d2
LP
506 if (r < 0)
507 return r;
508
509 p += ALIGN64(le64toh(tail->object.size));
510 }
511
512 r = journal_file_allocate(f, p, size);
513 if (r < 0)
514 return r;
515
fcde2389 516 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
517 if (r < 0)
518 return r;
519
520 o = (Object*) t;
521
522 zero(o->object);
de190aef 523 o->object.type = type;
cec736d2
LP
524 o->object.size = htole64(size);
525
526 f->header->tail_object_offset = htole64(p);
cec736d2
LP
527 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
528
529 *ret = o;
530 *offset = p;
531
532 return 0;
533}
534
de190aef 535static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
536 uint64_t s, p;
537 Object *o;
538 int r;
539
540 assert(f);
541
dfabe643 542 /* We estimate that we need 1 hash table entry per 768 of
4a92baf3
LP
543 journal file and we want to make sure we never get beyond
544 75% fill level. Calculate the hash table size for the
545 maximum file size based on these metrics. */
546
dfabe643 547 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
548 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
549 s = DEFAULT_DATA_HASH_TABLE_SIZE;
550
507f22bd 551 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 552
de190aef
LP
553 r = journal_file_append_object(f,
554 OBJECT_DATA_HASH_TABLE,
555 offsetof(Object, hash_table.items) + s,
556 &o, &p);
cec736d2
LP
557 if (r < 0)
558 return r;
559
29804cc1 560 memzero(o->hash_table.items, s);
cec736d2 561
de190aef
LP
562 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
563 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
564
565 return 0;
566}
567
de190aef 568static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
569 uint64_t s, p;
570 Object *o;
571 int r;
572
573 assert(f);
574
3c1668da
LP
575 /* We use a fixed size hash table for the fields as this
576 * number should grow very slowly only */
577
de190aef
LP
578 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
579 r = journal_file_append_object(f,
580 OBJECT_FIELD_HASH_TABLE,
581 offsetof(Object, hash_table.items) + s,
582 &o, &p);
cec736d2
LP
583 if (r < 0)
584 return r;
585
29804cc1 586 memzero(o->hash_table.items, s);
cec736d2 587
de190aef
LP
588 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
589 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
590
591 return 0;
592}
593
de190aef 594static int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
595 uint64_t s, p;
596 void *t;
597 int r;
598
599 assert(f);
600
de190aef
LP
601 p = le64toh(f->header->data_hash_table_offset);
602 s = le64toh(f->header->data_hash_table_size);
cec736d2 603
de190aef 604 r = journal_file_move_to(f,
16e9f408 605 OBJECT_DATA_HASH_TABLE,
fcde2389 606 true,
de190aef
LP
607 p, s,
608 &t);
cec736d2
LP
609 if (r < 0)
610 return r;
611
de190aef 612 f->data_hash_table = t;
cec736d2
LP
613 return 0;
614}
615
de190aef 616static int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
617 uint64_t s, p;
618 void *t;
619 int r;
620
621 assert(f);
622
de190aef
LP
623 p = le64toh(f->header->field_hash_table_offset);
624 s = le64toh(f->header->field_hash_table_size);
cec736d2 625
de190aef 626 r = journal_file_move_to(f,
16e9f408 627 OBJECT_FIELD_HASH_TABLE,
fcde2389 628 true,
de190aef
LP
629 p, s,
630 &t);
cec736d2
LP
631 if (r < 0)
632 return r;
633
de190aef 634 f->field_hash_table = t;
cec736d2
LP
635 return 0;
636}
637
3c1668da
LP
638static int journal_file_link_field(
639 JournalFile *f,
640 Object *o,
641 uint64_t offset,
642 uint64_t hash) {
643
644 uint64_t p, h;
645 int r;
646
647 assert(f);
648 assert(o);
649 assert(offset > 0);
650
651 if (o->object.type != OBJECT_FIELD)
652 return -EINVAL;
653
654 /* This might alter the window we are looking at */
655
656 o->field.next_hash_offset = o->field.head_data_offset = 0;
657
658 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
659 p = le64toh(f->field_hash_table[h].tail_hash_offset);
660 if (p == 0)
661 f->field_hash_table[h].head_hash_offset = htole64(offset);
662 else {
663 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
664 if (r < 0)
665 return r;
666
667 o->field.next_hash_offset = htole64(offset);
668 }
669
670 f->field_hash_table[h].tail_hash_offset = htole64(offset);
671
672 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
673 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
674
675 return 0;
676}
677
678static int journal_file_link_data(
679 JournalFile *f,
680 Object *o,
681 uint64_t offset,
682 uint64_t hash) {
683
de190aef 684 uint64_t p, h;
cec736d2
LP
685 int r;
686
687 assert(f);
688 assert(o);
689 assert(offset > 0);
b588975f
LP
690
691 if (o->object.type != OBJECT_DATA)
692 return -EINVAL;
cec736d2 693
48496df6
LP
694 /* This might alter the window we are looking at */
695
de190aef
LP
696 o->data.next_hash_offset = o->data.next_field_offset = 0;
697 o->data.entry_offset = o->data.entry_array_offset = 0;
698 o->data.n_entries = 0;
cec736d2 699
de190aef 700 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
8db4213e 701 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 702 if (p == 0)
cec736d2 703 /* Only entry in the hash table is easy */
de190aef 704 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 705 else {
48496df6
LP
706 /* Move back to the previous data object, to patch in
707 * pointer */
cec736d2 708
de190aef 709 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
710 if (r < 0)
711 return r;
712
de190aef 713 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
714 }
715
de190aef 716 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 717
dca6219e
LP
718 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
719 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
720
cec736d2
LP
721 return 0;
722}
723
3c1668da
LP
724int journal_file_find_field_object_with_hash(
725 JournalFile *f,
726 const void *field, uint64_t size, uint64_t hash,
727 Object **ret, uint64_t *offset) {
728
729 uint64_t p, osize, h;
730 int r;
731
732 assert(f);
733 assert(field && size > 0);
734
735 osize = offsetof(Object, field.payload) + size;
736
737 if (f->header->field_hash_table_size == 0)
738 return -EBADMSG;
739
740 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
741 p = le64toh(f->field_hash_table[h].head_hash_offset);
742
743 while (p > 0) {
744 Object *o;
745
746 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
747 if (r < 0)
748 return r;
749
750 if (le64toh(o->field.hash) == hash &&
751 le64toh(o->object.size) == osize &&
752 memcmp(o->field.payload, field, size) == 0) {
753
754 if (ret)
755 *ret = o;
756 if (offset)
757 *offset = p;
758
759 return 1;
760 }
761
762 p = le64toh(o->field.next_hash_offset);
763 }
764
765 return 0;
766}
767
768int journal_file_find_field_object(
769 JournalFile *f,
770 const void *field, uint64_t size,
771 Object **ret, uint64_t *offset) {
772
773 uint64_t hash;
774
775 assert(f);
776 assert(field && size > 0);
777
778 hash = hash64(field, size);
779
780 return journal_file_find_field_object_with_hash(f,
781 field, size, hash,
782 ret, offset);
783}
784
de190aef
LP
785int journal_file_find_data_object_with_hash(
786 JournalFile *f,
787 const void *data, uint64_t size, uint64_t hash,
788 Object **ret, uint64_t *offset) {
48496df6 789
de190aef 790 uint64_t p, osize, h;
cec736d2
LP
791 int r;
792
793 assert(f);
794 assert(data || size == 0);
795
796 osize = offsetof(Object, data.payload) + size;
797
bc85bfee
LP
798 if (f->header->data_hash_table_size == 0)
799 return -EBADMSG;
800
de190aef
LP
801 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
802 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 803
de190aef
LP
804 while (p > 0) {
805 Object *o;
cec736d2 806
de190aef 807 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
808 if (r < 0)
809 return r;
810
807e17f0 811 if (le64toh(o->data.hash) != hash)
85a131e8 812 goto next;
807e17f0 813
d89c8fdf 814 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 815#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51
ZJS
816 uint64_t l;
817 size_t rsize;
cec736d2 818
807e17f0
LP
819 l = le64toh(o->object.size);
820 if (l <= offsetof(Object, data.payload))
cec736d2
LP
821 return -EBADMSG;
822
807e17f0
LP
823 l -= offsetof(Object, data.payload);
824
d89c8fdf
ZJS
825 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
826 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
827 if (r < 0)
828 return r;
807e17f0 829
b785c858 830 if (rsize == size &&
807e17f0
LP
831 memcmp(f->compress_buffer, data, size) == 0) {
832
833 if (ret)
834 *ret = o;
835
836 if (offset)
837 *offset = p;
838
839 return 1;
840 }
3b1a55e1
ZJS
841#else
842 return -EPROTONOSUPPORT;
843#endif
807e17f0
LP
844 } else if (le64toh(o->object.size) == osize &&
845 memcmp(o->data.payload, data, size) == 0) {
846
cec736d2
LP
847 if (ret)
848 *ret = o;
849
850 if (offset)
851 *offset = p;
852
de190aef 853 return 1;
cec736d2
LP
854 }
855
85a131e8 856 next:
cec736d2
LP
857 p = le64toh(o->data.next_hash_offset);
858 }
859
de190aef
LP
860 return 0;
861}
862
863int journal_file_find_data_object(
864 JournalFile *f,
865 const void *data, uint64_t size,
866 Object **ret, uint64_t *offset) {
867
868 uint64_t hash;
869
870 assert(f);
871 assert(data || size == 0);
872
873 hash = hash64(data, size);
874
875 return journal_file_find_data_object_with_hash(f,
876 data, size, hash,
877 ret, offset);
878}
879
3c1668da
LP
880static int journal_file_append_field(
881 JournalFile *f,
882 const void *field, uint64_t size,
883 Object **ret, uint64_t *offset) {
884
885 uint64_t hash, p;
886 uint64_t osize;
887 Object *o;
888 int r;
889
890 assert(f);
891 assert(field && size > 0);
892
893 hash = hash64(field, size);
894
895 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
896 if (r < 0)
897 return r;
898 else if (r > 0) {
899
900 if (ret)
901 *ret = o;
902
903 if (offset)
904 *offset = p;
905
906 return 0;
907 }
908
909 osize = offsetof(Object, field.payload) + size;
910 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
911 if (r < 0)
912 return r;
3c1668da
LP
913
914 o->field.hash = htole64(hash);
915 memcpy(o->field.payload, field, size);
916
917 r = journal_file_link_field(f, o, p, hash);
918 if (r < 0)
919 return r;
920
921 /* The linking might have altered the window, so let's
922 * refresh our pointer */
923 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
924 if (r < 0)
925 return r;
926
927#ifdef HAVE_GCRYPT
928 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
929 if (r < 0)
930 return r;
931#endif
932
933 if (ret)
934 *ret = o;
935
936 if (offset)
937 *offset = p;
938
939 return 0;
940}
941
48496df6
LP
942static int journal_file_append_data(
943 JournalFile *f,
944 const void *data, uint64_t size,
945 Object **ret, uint64_t *offset) {
946
de190aef
LP
947 uint64_t hash, p;
948 uint64_t osize;
949 Object *o;
d89c8fdf 950 int r, compression = 0;
3c1668da 951 const void *eq;
de190aef
LP
952
953 assert(f);
954 assert(data || size == 0);
955
956 hash = hash64(data, size);
957
958 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
959 if (r < 0)
960 return r;
961 else if (r > 0) {
962
963 if (ret)
964 *ret = o;
965
966 if (offset)
967 *offset = p;
968
969 return 0;
970 }
971
972 osize = offsetof(Object, data.payload) + size;
973 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
974 if (r < 0)
975 return r;
976
cec736d2 977 o->data.hash = htole64(hash);
807e17f0 978
d89c8fdf
ZJS
979#if defined(HAVE_XZ) || defined(HAVE_LZ4)
980 if (f->compress_xz &&
807e17f0 981 size >= COMPRESSION_SIZE_THRESHOLD) {
fa1c4b51 982 size_t rsize;
807e17f0 983
d89c8fdf 984 compression = compress_blob(data, size, o->data.payload, &rsize);
807e17f0 985
d89c8fdf 986 if (compression) {
807e17f0 987 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 988 o->object.flags |= compression;
807e17f0 989
fa1c4b51 990 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 991 size, rsize, object_compressed_to_string(compression));
807e17f0
LP
992 }
993 }
994#endif
995
d89c8fdf 996 if (!compression && size > 0)
807e17f0 997 memcpy(o->data.payload, data, size);
cec736d2 998
de190aef 999 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1000 if (r < 0)
1001 return r;
1002
48496df6
LP
1003 /* The linking might have altered the window, so let's
1004 * refresh our pointer */
1005 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1006 if (r < 0)
1007 return r;
1008
08c6f819
SL
1009 if (!data)
1010 eq = NULL;
1011 else
1012 eq = memchr(data, '=', size);
3c1668da 1013 if (eq && eq > data) {
748db592 1014 Object *fo = NULL;
3c1668da 1015 uint64_t fp;
3c1668da
LP
1016
1017 /* Create field object ... */
1018 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1019 if (r < 0)
1020 return r;
1021
1022 /* ... and link it in. */
1023 o->data.next_field_offset = fo->field.head_data_offset;
1024 fo->field.head_data_offset = le64toh(p);
1025 }
1026
5996c7c2
LP
1027#ifdef HAVE_GCRYPT
1028 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1029 if (r < 0)
1030 return r;
1031#endif
1032
cec736d2
LP
1033 if (ret)
1034 *ret = o;
1035
1036 if (offset)
de190aef 1037 *offset = p;
cec736d2
LP
1038
1039 return 0;
1040}
1041
1042uint64_t journal_file_entry_n_items(Object *o) {
1043 assert(o);
b588975f
LP
1044
1045 if (o->object.type != OBJECT_ENTRY)
1046 return 0;
cec736d2
LP
1047
1048 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1049}
1050
0284adc6 1051uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1052 assert(o);
b588975f
LP
1053
1054 if (o->object.type != OBJECT_ENTRY_ARRAY)
1055 return 0;
de190aef
LP
1056
1057 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1058}
1059
fb9a24b6
LP
1060uint64_t journal_file_hash_table_n_items(Object *o) {
1061 assert(o);
b588975f
LP
1062
1063 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1064 o->object.type != OBJECT_FIELD_HASH_TABLE)
1065 return 0;
fb9a24b6
LP
1066
1067 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1068}
1069
de190aef 1070static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1071 le64_t *first,
1072 le64_t *idx,
de190aef 1073 uint64_t p) {
cec736d2 1074 int r;
de190aef
LP
1075 uint64_t n = 0, ap = 0, q, i, a, hidx;
1076 Object *o;
1077
cec736d2 1078 assert(f);
de190aef
LP
1079 assert(first);
1080 assert(idx);
1081 assert(p > 0);
cec736d2 1082
de190aef
LP
1083 a = le64toh(*first);
1084 i = hidx = le64toh(*idx);
1085 while (a > 0) {
1086
1087 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1088 if (r < 0)
1089 return r;
cec736d2 1090
de190aef
LP
1091 n = journal_file_entry_array_n_items(o);
1092 if (i < n) {
1093 o->entry_array.items[i] = htole64(p);
1094 *idx = htole64(hidx + 1);
1095 return 0;
1096 }
cec736d2 1097
de190aef
LP
1098 i -= n;
1099 ap = a;
1100 a = le64toh(o->entry_array.next_entry_array_offset);
1101 }
1102
1103 if (hidx > n)
1104 n = (hidx+1) * 2;
1105 else
1106 n = n * 2;
1107
1108 if (n < 4)
1109 n = 4;
1110
1111 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1112 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1113 &o, &q);
cec736d2
LP
1114 if (r < 0)
1115 return r;
1116
feb12d3e 1117#ifdef HAVE_GCRYPT
5996c7c2 1118 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1119 if (r < 0)
1120 return r;
feb12d3e 1121#endif
b0af6f41 1122
de190aef 1123 o->entry_array.items[i] = htole64(p);
cec736d2 1124
de190aef 1125 if (ap == 0)
7be3aa17 1126 *first = htole64(q);
cec736d2 1127 else {
de190aef 1128 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1129 if (r < 0)
1130 return r;
1131
de190aef
LP
1132 o->entry_array.next_entry_array_offset = htole64(q);
1133 }
cec736d2 1134
2dee23eb
LP
1135 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1136 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1137
de190aef
LP
1138 *idx = htole64(hidx + 1);
1139
1140 return 0;
1141}
cec736d2 1142
de190aef 1143static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1144 le64_t *extra,
1145 le64_t *first,
1146 le64_t *idx,
de190aef
LP
1147 uint64_t p) {
1148
1149 int r;
1150
1151 assert(f);
1152 assert(extra);
1153 assert(first);
1154 assert(idx);
1155 assert(p > 0);
1156
1157 if (*idx == 0)
1158 *extra = htole64(p);
1159 else {
4fd052ae 1160 le64_t i;
de190aef 1161
7be3aa17 1162 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1163 r = link_entry_into_array(f, first, &i, p);
1164 if (r < 0)
1165 return r;
cec736d2
LP
1166 }
1167
de190aef
LP
1168 *idx = htole64(le64toh(*idx) + 1);
1169 return 0;
1170}
1171
1172static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1173 uint64_t p;
1174 int r;
1175 assert(f);
1176 assert(o);
1177 assert(offset > 0);
1178
1179 p = le64toh(o->entry.items[i].object_offset);
1180 if (p == 0)
1181 return -EINVAL;
1182
1183 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1184 if (r < 0)
1185 return r;
1186
de190aef
LP
1187 return link_entry_into_array_plus_one(f,
1188 &o->data.entry_offset,
1189 &o->data.entry_array_offset,
1190 &o->data.n_entries,
1191 offset);
cec736d2
LP
1192}
1193
1194static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1195 uint64_t n, i;
cec736d2
LP
1196 int r;
1197
1198 assert(f);
1199 assert(o);
1200 assert(offset > 0);
b588975f
LP
1201
1202 if (o->object.type != OBJECT_ENTRY)
1203 return -EINVAL;
cec736d2 1204
b788cc23
LP
1205 __sync_synchronize();
1206
cec736d2 1207 /* Link up the entry itself */
de190aef
LP
1208 r = link_entry_into_array(f,
1209 &f->header->entry_array_offset,
1210 &f->header->n_entries,
1211 offset);
1212 if (r < 0)
1213 return r;
cec736d2 1214
507f22bd 1215 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1216
de190aef 1217 if (f->header->head_entry_realtime == 0)
0ac38b70 1218 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1219
0ac38b70 1220 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1221 f->header->tail_entry_monotonic = o->entry.monotonic;
1222
1223 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1224
1225 /* Link up the items */
1226 n = journal_file_entry_n_items(o);
1227 for (i = 0; i < n; i++) {
1228 r = journal_file_link_entry_item(f, o, offset, i);
1229 if (r < 0)
1230 return r;
1231 }
1232
cec736d2
LP
1233 return 0;
1234}
1235
1236static int journal_file_append_entry_internal(
1237 JournalFile *f,
1238 const dual_timestamp *ts,
1239 uint64_t xor_hash,
1240 const EntryItem items[], unsigned n_items,
de190aef 1241 uint64_t *seqnum,
cec736d2
LP
1242 Object **ret, uint64_t *offset) {
1243 uint64_t np;
1244 uint64_t osize;
1245 Object *o;
1246 int r;
1247
1248 assert(f);
1249 assert(items || n_items == 0);
de190aef 1250 assert(ts);
cec736d2
LP
1251
1252 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1253
de190aef 1254 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1255 if (r < 0)
1256 return r;
1257
d98cc1f2 1258 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1259 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1260 o->entry.realtime = htole64(ts->realtime);
1261 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1262 o->entry.xor_hash = htole64(xor_hash);
1263 o->entry.boot_id = f->header->boot_id;
1264
feb12d3e 1265#ifdef HAVE_GCRYPT
5996c7c2 1266 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1267 if (r < 0)
1268 return r;
feb12d3e 1269#endif
b0af6f41 1270
cec736d2
LP
1271 r = journal_file_link_entry(f, o, np);
1272 if (r < 0)
1273 return r;
1274
1275 if (ret)
1276 *ret = o;
1277
1278 if (offset)
1279 *offset = np;
1280
1281 return 0;
1282}
1283
cf244689 1284void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1285 assert(f);
1286
1287 /* inotify() does not receive IN_MODIFY events from file
1288 * accesses done via mmap(). After each access we hence
1289 * trigger IN_MODIFY by truncating the journal file to its
1290 * current size which triggers IN_MODIFY. */
1291
bc85bfee
LP
1292 __sync_synchronize();
1293
50f20cfd 1294 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
56f64d95 1295 log_error_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1296}
1297
1f2da9ec
LP
1298static int entry_item_cmp(const void *_a, const void *_b) {
1299 const EntryItem *a = _a, *b = _b;
1300
1301 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1302 return -1;
1303 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1304 return 1;
1305 return 0;
1306}
1307
de190aef 1308int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1309 unsigned i;
1310 EntryItem *items;
1311 int r;
1312 uint64_t xor_hash = 0;
de190aef 1313 struct dual_timestamp _ts;
cec736d2
LP
1314
1315 assert(f);
1316 assert(iovec || n_iovec == 0);
1317
de190aef
LP
1318 if (!ts) {
1319 dual_timestamp_get(&_ts);
1320 ts = &_ts;
1321 }
1322
1323 if (f->tail_entry_monotonic_valid &&
1324 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1325 return -EINVAL;
1326
feb12d3e 1327#ifdef HAVE_GCRYPT
7560fffc
LP
1328 r = journal_file_maybe_append_tag(f, ts->realtime);
1329 if (r < 0)
1330 return r;
feb12d3e 1331#endif
7560fffc 1332
64825d3c 1333 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1334 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1335
1336 for (i = 0; i < n_iovec; i++) {
1337 uint64_t p;
1338 Object *o;
1339
1340 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1341 if (r < 0)
cf244689 1342 return r;
cec736d2
LP
1343
1344 xor_hash ^= le64toh(o->data.hash);
1345 items[i].object_offset = htole64(p);
de7b95cd 1346 items[i].hash = o->data.hash;
cec736d2
LP
1347 }
1348
1f2da9ec
LP
1349 /* Order by the position on disk, in order to improve seek
1350 * times for rotating media. */
7ff7394d 1351 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1352
de190aef 1353 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1354
50f20cfd
LP
1355 journal_file_post_change(f);
1356
cec736d2
LP
1357 return r;
1358}
1359
typedef struct ChainCacheItem ChainCacheItem;

/* One cached position within an entry array chain */
struct ChainCacheItem {
        /* offset of the array that starts the chain */
        uint64_t first;
        /* offset of the array that was cached */
        uint64_t array;
        /* first item stored in the cached array */
        uint64_t begin;
        /* number of items in all arrays of the chain before the cached one */
        uint64_t total;
        /* index looked at last, used to improve locality when bisecting */
        uint64_t last_index;
};
1367
1368static void chain_cache_put(
4743015d 1369 OrderedHashmap *h,
a4bcff5b
LP
1370 ChainCacheItem *ci,
1371 uint64_t first,
1372 uint64_t array,
1373 uint64_t begin,
f268980d
LP
1374 uint64_t total,
1375 uint64_t last_index) {
a4bcff5b
LP
1376
1377 if (!ci) {
34741aa3
LP
1378 /* If the chain item to cache for this chain is the
1379 * first one it's not worth caching anything */
1380 if (array == first)
1381 return;
1382
29433089 1383 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 1384 ci = ordered_hashmap_steal_first(h);
29433089
LP
1385 assert(ci);
1386 } else {
a4bcff5b
LP
1387 ci = new(ChainCacheItem, 1);
1388 if (!ci)
1389 return;
1390 }
1391
1392 ci->first = first;
1393
4743015d 1394 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
1395 free(ci);
1396 return;
1397 }
1398 } else
1399 assert(ci->first == first);
1400
1401 ci->array = array;
1402 ci->begin = begin;
1403 ci->total = total;
f268980d 1404 ci->last_index = last_index;
a4bcff5b
LP
1405}
1406
f268980d
LP
1407static int generic_array_get(
1408 JournalFile *f,
1409 uint64_t first,
1410 uint64_t i,
1411 Object **ret, uint64_t *offset) {
de190aef 1412
cec736d2 1413 Object *o;
a4bcff5b 1414 uint64_t p = 0, a, t = 0;
cec736d2 1415 int r;
a4bcff5b 1416 ChainCacheItem *ci;
cec736d2
LP
1417
1418 assert(f);
1419
de190aef 1420 a = first;
a4bcff5b
LP
1421
1422 /* Try the chain cache first */
4743015d 1423 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1424 if (ci && i > ci->total) {
1425 a = ci->array;
1426 i -= ci->total;
1427 t = ci->total;
1428 }
1429
de190aef 1430 while (a > 0) {
a4bcff5b 1431 uint64_t k;
cec736d2 1432
de190aef
LP
1433 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1434 if (r < 0)
1435 return r;
cec736d2 1436
a4bcff5b
LP
1437 k = journal_file_entry_array_n_items(o);
1438 if (i < k) {
de190aef 1439 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1440 goto found;
cec736d2
LP
1441 }
1442
a4bcff5b
LP
1443 i -= k;
1444 t += k;
de190aef
LP
1445 a = le64toh(o->entry_array.next_entry_array_offset);
1446 }
1447
a4bcff5b
LP
1448 return 0;
1449
1450found:
1451 /* Let's cache this item for the next invocation */
af13a6b0 1452 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1453
1454 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1455 if (r < 0)
1456 return r;
1457
1458 if (ret)
1459 *ret = o;
1460
1461 if (offset)
1462 *offset = p;
1463
1464 return 1;
1465}
1466
f268980d
LP
1467static int generic_array_get_plus_one(
1468 JournalFile *f,
1469 uint64_t extra,
1470 uint64_t first,
1471 uint64_t i,
1472 Object **ret, uint64_t *offset) {
de190aef
LP
1473
1474 Object *o;
1475
1476 assert(f);
1477
1478 if (i == 0) {
1479 int r;
1480
1481 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1482 if (r < 0)
1483 return r;
1484
de190aef
LP
1485 if (ret)
1486 *ret = o;
cec736d2 1487
de190aef
LP
1488 if (offset)
1489 *offset = extra;
cec736d2 1490
de190aef 1491 return 1;
cec736d2
LP
1492 }
1493
de190aef
LP
1494 return generic_array_get(f, first, i-1, ret, offset);
1495}
cec736d2 1496
de190aef
LP
/* Results of the test_object callbacks used when bisecting */
enum {
        TEST_FOUND = 0,
        TEST_LEFT  = 1,
        TEST_RIGHT = 2
};
cec736d2 1502
f268980d
LP
1503static int generic_array_bisect(
1504 JournalFile *f,
1505 uint64_t first,
1506 uint64_t n,
1507 uint64_t needle,
1508 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1509 direction_t direction,
1510 Object **ret,
1511 uint64_t *offset,
1512 uint64_t *idx) {
1513
1514 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1515 bool subtract_one = false;
1516 Object *o, *array = NULL;
1517 int r;
a4bcff5b 1518 ChainCacheItem *ci;
cec736d2 1519
de190aef
LP
1520 assert(f);
1521 assert(test_object);
cec736d2 1522
a4bcff5b 1523 /* Start with the first array in the chain */
de190aef 1524 a = first;
a4bcff5b 1525
4743015d 1526 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1527 if (ci && n > ci->total) {
1528 /* Ah, we have iterated this bisection array chain
1529 * previously! Let's see if we can skip ahead in the
1530 * chain, as far as the last time. But we can't jump
1531 * backwards in the chain, so let's check that
1532 * first. */
1533
1534 r = test_object(f, ci->begin, needle);
1535 if (r < 0)
1536 return r;
1537
1538 if (r == TEST_LEFT) {
f268980d 1539 /* OK, what we are looking for is right of the
a4bcff5b
LP
1540 * begin of this EntryArray, so let's jump
1541 * straight to previously cached array in the
1542 * chain */
1543
1544 a = ci->array;
1545 n -= ci->total;
1546 t = ci->total;
f268980d 1547 last_index = ci->last_index;
a4bcff5b
LP
1548 }
1549 }
1550
de190aef
LP
1551 while (a > 0) {
1552 uint64_t left, right, k, lp;
1553
1554 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1555 if (r < 0)
1556 return r;
1557
de190aef
LP
1558 k = journal_file_entry_array_n_items(array);
1559 right = MIN(k, n);
1560 if (right <= 0)
1561 return 0;
cec736d2 1562
de190aef
LP
1563 i = right - 1;
1564 lp = p = le64toh(array->entry_array.items[i]);
1565 if (p <= 0)
1566 return -EBADMSG;
cec736d2 1567
de190aef
LP
1568 r = test_object(f, p, needle);
1569 if (r < 0)
1570 return r;
cec736d2 1571
de190aef
LP
1572 if (r == TEST_FOUND)
1573 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1574
1575 if (r == TEST_RIGHT) {
1576 left = 0;
1577 right -= 1;
f268980d
LP
1578
1579 if (last_index != (uint64_t) -1) {
1580 assert(last_index <= right);
1581
1582 /* If we cached the last index we
1583 * looked at, let's try to not to jump
1584 * too wildly around and see if we can
1585 * limit the range to look at early to
1586 * the immediate neighbors of the last
1587 * index we looked at. */
1588
1589 if (last_index > 0) {
1590 uint64_t x = last_index - 1;
1591
1592 p = le64toh(array->entry_array.items[x]);
1593 if (p <= 0)
1594 return -EBADMSG;
1595
1596 r = test_object(f, p, needle);
1597 if (r < 0)
1598 return r;
1599
1600 if (r == TEST_FOUND)
1601 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1602
1603 if (r == TEST_RIGHT)
1604 right = x;
1605 else
1606 left = x + 1;
1607 }
1608
1609 if (last_index < right) {
1610 uint64_t y = last_index + 1;
1611
1612 p = le64toh(array->entry_array.items[y]);
1613 if (p <= 0)
1614 return -EBADMSG;
1615
1616 r = test_object(f, p, needle);
1617 if (r < 0)
1618 return r;
1619
1620 if (r == TEST_FOUND)
1621 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1622
1623 if (r == TEST_RIGHT)
1624 right = y;
1625 else
1626 left = y + 1;
1627 }
f268980d
LP
1628 }
1629
de190aef
LP
1630 for (;;) {
1631 if (left == right) {
1632 if (direction == DIRECTION_UP)
1633 subtract_one = true;
1634
1635 i = left;
1636 goto found;
1637 }
1638
1639 assert(left < right);
de190aef 1640 i = (left + right) / 2;
f268980d 1641
de190aef
LP
1642 p = le64toh(array->entry_array.items[i]);
1643 if (p <= 0)
1644 return -EBADMSG;
1645
1646 r = test_object(f, p, needle);
1647 if (r < 0)
1648 return r;
cec736d2 1649
de190aef
LP
1650 if (r == TEST_FOUND)
1651 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1652
1653 if (r == TEST_RIGHT)
1654 right = i;
1655 else
1656 left = i + 1;
1657 }
1658 }
1659
cbdca852
LP
1660 if (k > n) {
1661 if (direction == DIRECTION_UP) {
1662 i = n;
1663 subtract_one = true;
1664 goto found;
1665 }
1666
cec736d2 1667 return 0;
cbdca852 1668 }
cec736d2 1669
de190aef
LP
1670 last_p = lp;
1671
1672 n -= k;
1673 t += k;
f268980d 1674 last_index = (uint64_t) -1;
de190aef 1675 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1676 }
1677
1678 return 0;
de190aef
LP
1679
1680found:
1681 if (subtract_one && t == 0 && i == 0)
1682 return 0;
1683
a4bcff5b 1684 /* Let's cache this item for the next invocation */
af13a6b0 1685 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 1686
de190aef
LP
1687 if (subtract_one && i == 0)
1688 p = last_p;
1689 else if (subtract_one)
1690 p = le64toh(array->entry_array.items[i-1]);
1691 else
1692 p = le64toh(array->entry_array.items[i]);
1693
1694 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1695 if (r < 0)
1696 return r;
1697
1698 if (ret)
1699 *ret = o;
1700
1701 if (offset)
1702 *offset = p;
1703
1704 if (idx)
cbdca852 1705 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1706
1707 return 1;
cec736d2
LP
1708}
1709
f268980d
LP
1710
1711static int generic_array_bisect_plus_one(
1712 JournalFile *f,
1713 uint64_t extra,
1714 uint64_t first,
1715 uint64_t n,
1716 uint64_t needle,
1717 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1718 direction_t direction,
1719 Object **ret,
1720 uint64_t *offset,
1721 uint64_t *idx) {
de190aef 1722
cec736d2 1723 int r;
cbdca852
LP
1724 bool step_back = false;
1725 Object *o;
cec736d2
LP
1726
1727 assert(f);
de190aef 1728 assert(test_object);
cec736d2 1729
de190aef
LP
1730 if (n <= 0)
1731 return 0;
cec736d2 1732
de190aef
LP
1733 /* This bisects the array in object 'first', but first checks
1734 * an extra */
de190aef
LP
1735 r = test_object(f, extra, needle);
1736 if (r < 0)
1737 return r;
a536e261
LP
1738
1739 if (r == TEST_FOUND)
1740 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1741
cbdca852
LP
1742 /* if we are looking with DIRECTION_UP then we need to first
1743 see if in the actual array there is a matching entry, and
1744 return the last one of that. But if there isn't any we need
1745 to return this one. Hence remember this, and return it
1746 below. */
1747 if (r == TEST_LEFT)
1748 step_back = direction == DIRECTION_UP;
de190aef 1749
cbdca852
LP
1750 if (r == TEST_RIGHT) {
1751 if (direction == DIRECTION_DOWN)
1752 goto found;
1753 else
1754 return 0;
a536e261 1755 }
cec736d2 1756
de190aef
LP
1757 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1758
cbdca852
LP
1759 if (r == 0 && step_back)
1760 goto found;
1761
ecf68b1d 1762 if (r > 0 && idx)
de190aef
LP
1763 (*idx) ++;
1764
1765 return r;
cbdca852
LP
1766
1767found:
1768 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1769 if (r < 0)
1770 return r;
1771
1772 if (ret)
1773 *ret = o;
1774
1775 if (offset)
1776 *offset = extra;
1777
1778 if (idx)
1779 *idx = 0;
1780
1781 return 1;
1782}
1783
44a6b1b6 1784_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1785 assert(f);
1786 assert(p > 0);
1787
1788 if (p == needle)
1789 return TEST_FOUND;
1790 else if (p < needle)
1791 return TEST_LEFT;
1792 else
1793 return TEST_RIGHT;
1794}
1795
1796int journal_file_move_to_entry_by_offset(
1797 JournalFile *f,
1798 uint64_t p,
1799 direction_t direction,
1800 Object **ret,
1801 uint64_t *offset) {
1802
1803 return generic_array_bisect(f,
1804 le64toh(f->header->entry_array_offset),
1805 le64toh(f->header->n_entries),
1806 p,
1807 test_object_offset,
1808 direction,
1809 ret, offset, NULL);
de190aef
LP
1810}
1811
cbdca852 1812
de190aef
LP
1813static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1814 Object *o;
1815 int r;
1816
1817 assert(f);
1818 assert(p > 0);
1819
1820 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1821 if (r < 0)
1822 return r;
1823
de190aef
LP
1824 if (le64toh(o->entry.seqnum) == needle)
1825 return TEST_FOUND;
1826 else if (le64toh(o->entry.seqnum) < needle)
1827 return TEST_LEFT;
1828 else
1829 return TEST_RIGHT;
1830}
cec736d2 1831
de190aef
LP
1832int journal_file_move_to_entry_by_seqnum(
1833 JournalFile *f,
1834 uint64_t seqnum,
1835 direction_t direction,
1836 Object **ret,
1837 uint64_t *offset) {
1838
1839 return generic_array_bisect(f,
1840 le64toh(f->header->entry_array_offset),
1841 le64toh(f->header->n_entries),
1842 seqnum,
1843 test_object_seqnum,
1844 direction,
1845 ret, offset, NULL);
1846}
cec736d2 1847
de190aef
LP
1848static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1849 Object *o;
1850 int r;
1851
1852 assert(f);
1853 assert(p > 0);
1854
1855 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1856 if (r < 0)
1857 return r;
1858
1859 if (le64toh(o->entry.realtime) == needle)
1860 return TEST_FOUND;
1861 else if (le64toh(o->entry.realtime) < needle)
1862 return TEST_LEFT;
1863 else
1864 return TEST_RIGHT;
cec736d2
LP
1865}
1866
de190aef
LP
1867int journal_file_move_to_entry_by_realtime(
1868 JournalFile *f,
1869 uint64_t realtime,
1870 direction_t direction,
1871 Object **ret,
1872 uint64_t *offset) {
1873
1874 return generic_array_bisect(f,
1875 le64toh(f->header->entry_array_offset),
1876 le64toh(f->header->n_entries),
1877 realtime,
1878 test_object_realtime,
1879 direction,
1880 ret, offset, NULL);
1881}
1882
1883static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1884 Object *o;
1885 int r;
1886
1887 assert(f);
1888 assert(p > 0);
1889
1890 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1891 if (r < 0)
1892 return r;
1893
1894 if (le64toh(o->entry.monotonic) == needle)
1895 return TEST_FOUND;
1896 else if (le64toh(o->entry.monotonic) < needle)
1897 return TEST_LEFT;
1898 else
1899 return TEST_RIGHT;
1900}
1901
47838ab3
ZJS
1902static inline int find_data_object_by_boot_id(
1903 JournalFile *f,
1904 sd_id128_t boot_id,
1905 Object **o,
1906 uint64_t *b) {
1907 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1908
1909 sd_id128_to_string(boot_id, t + 9);
1910 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1911}
1912
de190aef
LP
1913int journal_file_move_to_entry_by_monotonic(
1914 JournalFile *f,
1915 sd_id128_t boot_id,
1916 uint64_t monotonic,
1917 direction_t direction,
1918 Object **ret,
1919 uint64_t *offset) {
1920
de190aef
LP
1921 Object *o;
1922 int r;
1923
cbdca852 1924 assert(f);
de190aef 1925
47838ab3 1926 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
1927 if (r < 0)
1928 return r;
cbdca852 1929 if (r == 0)
de190aef
LP
1930 return -ENOENT;
1931
1932 return generic_array_bisect_plus_one(f,
1933 le64toh(o->data.entry_offset),
1934 le64toh(o->data.entry_array_offset),
1935 le64toh(o->data.n_entries),
1936 monotonic,
1937 test_object_monotonic,
1938 direction,
1939 ret, offset, NULL);
1940}
1941
de190aef
LP
1942int journal_file_next_entry(
1943 JournalFile *f,
1944 Object *o, uint64_t p,
1945 direction_t direction,
1946 Object **ret, uint64_t *offset) {
1947
fb099c8d 1948 uint64_t i, n, ofs;
cec736d2
LP
1949 int r;
1950
1951 assert(f);
de190aef
LP
1952 assert(p > 0 || !o);
1953
1954 n = le64toh(f->header->n_entries);
1955 if (n <= 0)
1956 return 0;
cec736d2
LP
1957
1958 if (!o)
de190aef 1959 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 1960 else {
de190aef 1961 if (o->object.type != OBJECT_ENTRY)
cec736d2
LP
1962 return -EINVAL;
1963
de190aef
LP
1964 r = generic_array_bisect(f,
1965 le64toh(f->header->entry_array_offset),
1966 le64toh(f->header->n_entries),
1967 p,
1968 test_object_offset,
1969 DIRECTION_DOWN,
1970 NULL, NULL,
1971 &i);
1972 if (r <= 0)
1973 return r;
1974
1975 if (direction == DIRECTION_DOWN) {
1976 if (i >= n - 1)
1977 return 0;
1978
1979 i++;
1980 } else {
1981 if (i <= 0)
1982 return 0;
1983
1984 i--;
1985 }
cec736d2
LP
1986 }
1987
de190aef 1988 /* And jump to it */
fb099c8d
ZJS
1989 r = generic_array_get(f,
1990 le64toh(f->header->entry_array_offset),
1991 i,
1992 ret, &ofs);
1993 if (r <= 0)
1994 return r;
1995
1996 if (p > 0 &&
1997 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
1998 log_debug("%s: entry array corrupted at entry %"PRIu64,
1999 f->path, i);
2000 return -EBADMSG;
2001 }
2002
2003 if (offset)
2004 *offset = ofs;
2005
2006 return 1;
de190aef 2007}
cec736d2 2008
de190aef
LP
2009int journal_file_skip_entry(
2010 JournalFile *f,
2011 Object *o, uint64_t p,
2012 int64_t skip,
2013 Object **ret, uint64_t *offset) {
2014
2015 uint64_t i, n;
2016 int r;
2017
2018 assert(f);
2019 assert(o);
2020 assert(p > 0);
2021
2022 if (o->object.type != OBJECT_ENTRY)
2023 return -EINVAL;
2024
2025 r = generic_array_bisect(f,
2026 le64toh(f->header->entry_array_offset),
2027 le64toh(f->header->n_entries),
2028 p,
2029 test_object_offset,
2030 DIRECTION_DOWN,
2031 NULL, NULL,
2032 &i);
2033 if (r <= 0)
cec736d2
LP
2034 return r;
2035
de190aef
LP
2036 /* Calculate new index */
2037 if (skip < 0) {
2038 if ((uint64_t) -skip >= i)
2039 i = 0;
2040 else
2041 i = i - (uint64_t) -skip;
2042 } else
2043 i += (uint64_t) skip;
cec736d2 2044
de190aef
LP
2045 n = le64toh(f->header->n_entries);
2046 if (n <= 0)
2047 return -EBADMSG;
cec736d2 2048
de190aef
LP
2049 if (i >= n)
2050 i = n-1;
2051
2052 return generic_array_get(f,
2053 le64toh(f->header->entry_array_offset),
2054 i,
2055 ret, offset);
cec736d2
LP
2056}
2057
de190aef
LP
2058int journal_file_next_entry_for_data(
2059 JournalFile *f,
2060 Object *o, uint64_t p,
2061 uint64_t data_offset,
2062 direction_t direction,
2063 Object **ret, uint64_t *offset) {
2064
2065 uint64_t n, i;
cec736d2 2066 int r;
de190aef 2067 Object *d;
cec736d2
LP
2068
2069 assert(f);
de190aef 2070 assert(p > 0 || !o);
cec736d2 2071
de190aef 2072 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2073 if (r < 0)
de190aef 2074 return r;
cec736d2 2075
de190aef
LP
2076 n = le64toh(d->data.n_entries);
2077 if (n <= 0)
2078 return n;
cec736d2 2079
de190aef
LP
2080 if (!o)
2081 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2082 else {
2083 if (o->object.type != OBJECT_ENTRY)
2084 return -EINVAL;
cec736d2 2085
de190aef
LP
2086 r = generic_array_bisect_plus_one(f,
2087 le64toh(d->data.entry_offset),
2088 le64toh(d->data.entry_array_offset),
2089 le64toh(d->data.n_entries),
2090 p,
2091 test_object_offset,
2092 DIRECTION_DOWN,
2093 NULL, NULL,
2094 &i);
2095
2096 if (r <= 0)
cec736d2
LP
2097 return r;
2098
de190aef
LP
2099 if (direction == DIRECTION_DOWN) {
2100 if (i >= n - 1)
2101 return 0;
cec736d2 2102
de190aef
LP
2103 i++;
2104 } else {
2105 if (i <= 0)
2106 return 0;
cec736d2 2107
de190aef
LP
2108 i--;
2109 }
cec736d2 2110
de190aef 2111 }
cec736d2 2112
de190aef
LP
2113 return generic_array_get_plus_one(f,
2114 le64toh(d->data.entry_offset),
2115 le64toh(d->data.entry_array_offset),
2116 i,
2117 ret, offset);
2118}
cec736d2 2119
cbdca852
LP
2120int journal_file_move_to_entry_by_offset_for_data(
2121 JournalFile *f,
2122 uint64_t data_offset,
2123 uint64_t p,
2124 direction_t direction,
2125 Object **ret, uint64_t *offset) {
2126
2127 int r;
2128 Object *d;
2129
2130 assert(f);
2131
2132 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2133 if (r < 0)
2134 return r;
2135
2136 return generic_array_bisect_plus_one(f,
2137 le64toh(d->data.entry_offset),
2138 le64toh(d->data.entry_array_offset),
2139 le64toh(d->data.n_entries),
2140 p,
2141 test_object_offset,
2142 direction,
2143 ret, offset, NULL);
2144}
2145
2146int journal_file_move_to_entry_by_monotonic_for_data(
2147 JournalFile *f,
2148 uint64_t data_offset,
2149 sd_id128_t boot_id,
2150 uint64_t monotonic,
2151 direction_t direction,
2152 Object **ret, uint64_t *offset) {
2153
cbdca852
LP
2154 Object *o, *d;
2155 int r;
2156 uint64_t b, z;
2157
2158 assert(f);
2159
2160 /* First, seek by time */
47838ab3 2161 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2162 if (r < 0)
2163 return r;
2164 if (r == 0)
2165 return -ENOENT;
2166
2167 r = generic_array_bisect_plus_one(f,
2168 le64toh(o->data.entry_offset),
2169 le64toh(o->data.entry_array_offset),
2170 le64toh(o->data.n_entries),
2171 monotonic,
2172 test_object_monotonic,
2173 direction,
2174 NULL, &z, NULL);
2175 if (r <= 0)
2176 return r;
2177
2178 /* And now, continue seeking until we find an entry that
2179 * exists in both bisection arrays */
2180
2181 for (;;) {
2182 Object *qo;
2183 uint64_t p, q;
2184
2185 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2186 if (r < 0)
2187 return r;
2188
2189 r = generic_array_bisect_plus_one(f,
2190 le64toh(d->data.entry_offset),
2191 le64toh(d->data.entry_array_offset),
2192 le64toh(d->data.n_entries),
2193 z,
2194 test_object_offset,
2195 direction,
2196 NULL, &p, NULL);
2197 if (r <= 0)
2198 return r;
2199
2200 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2201 if (r < 0)
2202 return r;
2203
2204 r = generic_array_bisect_plus_one(f,
2205 le64toh(o->data.entry_offset),
2206 le64toh(o->data.entry_array_offset),
2207 le64toh(o->data.n_entries),
2208 p,
2209 test_object_offset,
2210 direction,
2211 &qo, &q, NULL);
2212
2213 if (r <= 0)
2214 return r;
2215
2216 if (p == q) {
2217 if (ret)
2218 *ret = qo;
2219 if (offset)
2220 *offset = q;
2221
2222 return 1;
2223 }
2224
2225 z = q;
2226 }
cbdca852
LP
2227}
2228
de190aef
LP
2229int journal_file_move_to_entry_by_seqnum_for_data(
2230 JournalFile *f,
2231 uint64_t data_offset,
2232 uint64_t seqnum,
2233 direction_t direction,
2234 Object **ret, uint64_t *offset) {
cec736d2 2235
de190aef
LP
2236 Object *d;
2237 int r;
cec736d2 2238
91a31dde
LP
2239 assert(f);
2240
de190aef 2241 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2242 if (r < 0)
de190aef 2243 return r;
cec736d2 2244
de190aef
LP
2245 return generic_array_bisect_plus_one(f,
2246 le64toh(d->data.entry_offset),
2247 le64toh(d->data.entry_array_offset),
2248 le64toh(d->data.n_entries),
2249 seqnum,
2250 test_object_seqnum,
2251 direction,
2252 ret, offset, NULL);
2253}
cec736d2 2254
de190aef
LP
2255int journal_file_move_to_entry_by_realtime_for_data(
2256 JournalFile *f,
2257 uint64_t data_offset,
2258 uint64_t realtime,
2259 direction_t direction,
2260 Object **ret, uint64_t *offset) {
2261
2262 Object *d;
2263 int r;
2264
91a31dde
LP
2265 assert(f);
2266
de190aef 2267 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2268 if (r < 0)
de190aef
LP
2269 return r;
2270
2271 return generic_array_bisect_plus_one(f,
2272 le64toh(d->data.entry_offset),
2273 le64toh(d->data.entry_array_offset),
2274 le64toh(d->data.n_entries),
2275 realtime,
2276 test_object_realtime,
2277 direction,
2278 ret, offset, NULL);
cec736d2
LP
2279}
2280
0284adc6 2281void journal_file_dump(JournalFile *f) {
7560fffc 2282 Object *o;
7560fffc 2283 int r;
0284adc6 2284 uint64_t p;
7560fffc
LP
2285
2286 assert(f);
2287
0284adc6 2288 journal_file_print_header(f);
7560fffc 2289
0284adc6
LP
2290 p = le64toh(f->header->header_size);
2291 while (p != 0) {
2292 r = journal_file_move_to_object(f, -1, p, &o);
2293 if (r < 0)
2294 goto fail;
7560fffc 2295
0284adc6 2296 switch (o->object.type) {
d98cc1f2 2297
0284adc6
LP
2298 case OBJECT_UNUSED:
2299 printf("Type: OBJECT_UNUSED\n");
2300 break;
d98cc1f2 2301
0284adc6
LP
2302 case OBJECT_DATA:
2303 printf("Type: OBJECT_DATA\n");
2304 break;
7560fffc 2305
3c1668da
LP
2306 case OBJECT_FIELD:
2307 printf("Type: OBJECT_FIELD\n");
2308 break;
2309
0284adc6 2310 case OBJECT_ENTRY:
507f22bd
ZJS
2311 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2312 le64toh(o->entry.seqnum),
2313 le64toh(o->entry.monotonic),
2314 le64toh(o->entry.realtime));
0284adc6 2315 break;
7560fffc 2316
0284adc6
LP
2317 case OBJECT_FIELD_HASH_TABLE:
2318 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2319 break;
7560fffc 2320
0284adc6
LP
2321 case OBJECT_DATA_HASH_TABLE:
2322 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2323 break;
7560fffc 2324
0284adc6
LP
2325 case OBJECT_ENTRY_ARRAY:
2326 printf("Type: OBJECT_ENTRY_ARRAY\n");
2327 break;
7560fffc 2328
0284adc6 2329 case OBJECT_TAG:
507f22bd
ZJS
2330 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2331 le64toh(o->tag.seqnum),
2332 le64toh(o->tag.epoch));
0284adc6 2333 break;
3c1668da
LP
2334
2335 default:
2336 printf("Type: unknown (%u)\n", o->object.type);
2337 break;
0284adc6 2338 }
7560fffc 2339
d89c8fdf
ZJS
2340 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2341 printf("Flags: %s\n",
2342 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 2343
0284adc6
LP
2344 if (p == le64toh(f->header->tail_object_offset))
2345 p = 0;
2346 else
2347 p = p + ALIGN64(le64toh(o->object.size));
2348 }
7560fffc 2349
0284adc6
LP
2350 return;
2351fail:
2352 log_error("File corrupt");
7560fffc
LP
2353}
2354
718fe4b1
ZJS
2355static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2356 const char *x;
2357
2358 x = format_timestamp(buf, l, t);
2359 if (x)
2360 return x;
2361 return " --- ";
2362}
2363
0284adc6 2364void journal_file_print_header(JournalFile *f) {
2765b7bb 2365 char a[33], b[33], c[33], d[33];
ed375beb 2366 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2367 struct stat st;
2368 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2369
2370 assert(f);
7560fffc 2371
0284adc6
LP
2372 printf("File Path: %s\n"
2373 "File ID: %s\n"
2374 "Machine ID: %s\n"
2375 "Boot ID: %s\n"
2376 "Sequential Number ID: %s\n"
2377 "State: %s\n"
2378 "Compatible Flags:%s%s\n"
d89c8fdf 2379 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
2380 "Header size: %"PRIu64"\n"
2381 "Arena size: %"PRIu64"\n"
2382 "Data Hash Table Size: %"PRIu64"\n"
2383 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2384 "Rotate Suggested: %s\n"
507f22bd
ZJS
2385 "Head Sequential Number: %"PRIu64"\n"
2386 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2387 "Head Realtime Timestamp: %s\n"
3223f44f 2388 "Tail Realtime Timestamp: %s\n"
ed375beb 2389 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2390 "Objects: %"PRIu64"\n"
2391 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2392 f->path,
2393 sd_id128_to_string(f->header->file_id, a),
2394 sd_id128_to_string(f->header->machine_id, b),
2395 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2396 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2397 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2398 f->header->state == STATE_ONLINE ? "ONLINE" :
2399 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 2400 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
2401 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2402 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2403 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2404 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
2405 le64toh(f->header->header_size),
2406 le64toh(f->header->arena_size),
2407 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2408 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2409 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2410 le64toh(f->header->head_entry_seqnum),
2411 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2412 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2413 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2414 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2415 le64toh(f->header->n_objects),
2416 le64toh(f->header->n_entries));
7560fffc 2417
0284adc6 2418 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2419 printf("Data Objects: %"PRIu64"\n"
0284adc6 2420 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2421 le64toh(f->header->n_data),
0284adc6 2422 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2423
0284adc6 2424 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2425 printf("Field Objects: %"PRIu64"\n"
0284adc6 2426 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2427 le64toh(f->header->n_fields),
0284adc6 2428 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2429
2430 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2431 printf("Tag Objects: %"PRIu64"\n",
2432 le64toh(f->header->n_tags));
3223f44f 2433 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2434 printf("Entry Array Objects: %"PRIu64"\n",
2435 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2436
2437 if (fstat(f->fd, &st) >= 0)
2438 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
7560fffc
LP
2439}
2440
0284adc6
LP
2441int journal_file_open(
2442 const char *fname,
2443 int flags,
2444 mode_t mode,
2445 bool compress,
baed47c3 2446 bool seal,
0284adc6
LP
2447 JournalMetrics *metrics,
2448 MMapCache *mmap_cache,
2449 JournalFile *template,
2450 JournalFile **ret) {
7560fffc 2451
0284adc6
LP
2452 JournalFile *f;
2453 int r;
2454 bool newly_created = false;
7560fffc 2455
0284adc6 2456 assert(fname);
0559d3a5 2457 assert(ret);
7560fffc 2458
0284adc6
LP
2459 if ((flags & O_ACCMODE) != O_RDONLY &&
2460 (flags & O_ACCMODE) != O_RDWR)
2461 return -EINVAL;
7560fffc 2462
a0108012
LP
2463 if (!endswith(fname, ".journal") &&
2464 !endswith(fname, ".journal~"))
0284adc6 2465 return -EINVAL;
7560fffc 2466
0284adc6
LP
2467 f = new0(JournalFile, 1);
2468 if (!f)
2469 return -ENOMEM;
7560fffc 2470
0284adc6
LP
2471 f->fd = -1;
2472 f->mode = mode;
7560fffc 2473
0284adc6
LP
2474 f->flags = flags;
2475 f->prot = prot_from_flags(flags);
2476 f->writable = (flags & O_ACCMODE) != O_RDONLY;
92261977 2477#if defined(HAVE_LZ4)
d89c8fdf
ZJS
2478 f->compress_lz4 = compress;
2479#elif defined(HAVE_XZ)
2480 f->compress_xz = compress;
48b61739 2481#endif
49a32d43 2482#ifdef HAVE_GCRYPT
baed47c3 2483 f->seal = seal;
49a32d43 2484#endif
7560fffc 2485
0284adc6
LP
2486 if (mmap_cache)
2487 f->mmap = mmap_cache_ref(mmap_cache);
2488 else {
84168d80 2489 f->mmap = mmap_cache_new();
0284adc6
LP
2490 if (!f->mmap) {
2491 r = -ENOMEM;
2492 goto fail;
2493 }
2494 }
7560fffc 2495
0284adc6
LP
2496 f->path = strdup(fname);
2497 if (!f->path) {
2498 r = -ENOMEM;
2499 goto fail;
2500 }
7560fffc 2501
4743015d 2502 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
2503 if (!f->chain_cache) {
2504 r = -ENOMEM;
2505 goto fail;
2506 }
2507
0284adc6
LP
2508 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2509 if (f->fd < 0) {
2510 r = -errno;
2511 goto fail;
7560fffc 2512 }
7560fffc 2513
0284adc6
LP
2514 if (fstat(f->fd, &f->last_stat) < 0) {
2515 r = -errno;
2516 goto fail;
2517 }
7560fffc 2518
0284adc6 2519 if (f->last_stat.st_size == 0 && f->writable) {
fb0951b0
LP
2520 uint64_t crtime;
2521
2522 /* Let's attach the creation time to the journal file,
2523 * so that the vacuuming code knows the age of this
2524 * file even if the file might end up corrupted one
2525 * day... Ideally we'd just use the creation time many
2526 * file systems maintain for each file, but there is
2527 * currently no usable API to query this, hence let's
2528 * emulate this via extended attributes. If extended
2529 * attributes are not supported we'll just skip this,
2530 * and rely solely on mtime/atime/ctime of the file.*/
2531
2532 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2533 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
7560fffc 2534
feb12d3e 2535#ifdef HAVE_GCRYPT
0284adc6 2536 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2537 * just don't do sealing */
49a32d43
LP
2538 if (f->seal) {
2539 r = journal_file_fss_load(f);
2540 if (r < 0)
2541 f->seal = false;
2542 }
feb12d3e 2543#endif
7560fffc 2544
0284adc6
LP
2545 r = journal_file_init_header(f, template);
2546 if (r < 0)
2547 goto fail;
7560fffc 2548
0284adc6
LP
2549 if (fstat(f->fd, &f->last_stat) < 0) {
2550 r = -errno;
2551 goto fail;
2552 }
fb0951b0
LP
2553
2554 newly_created = true;
0284adc6 2555 }
7560fffc 2556
0284adc6
LP
2557 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2558 r = -EIO;
2559 goto fail;
2560 }
7560fffc 2561
0284adc6
LP
2562 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2563 if (f->header == MAP_FAILED) {
2564 f->header = NULL;
2565 r = -errno;
2566 goto fail;
2567 }
7560fffc 2568
0284adc6
LP
2569 if (!newly_created) {
2570 r = journal_file_verify_header(f);
2571 if (r < 0)
2572 goto fail;
2573 }
7560fffc 2574
feb12d3e 2575#ifdef HAVE_GCRYPT
0284adc6 2576 if (!newly_created && f->writable) {
baed47c3 2577 r = journal_file_fss_load(f);
0284adc6
LP
2578 if (r < 0)
2579 goto fail;
2580 }
feb12d3e 2581#endif
cec736d2
LP
2582
2583 if (f->writable) {
4a92baf3
LP
2584 if (metrics) {
2585 journal_default_metrics(metrics, f->fd);
2586 f->metrics = *metrics;
2587 } else if (template)
2588 f->metrics = template->metrics;
2589
cec736d2
LP
2590 r = journal_file_refresh_header(f);
2591 if (r < 0)
2592 goto fail;
2593 }
2594
feb12d3e 2595#ifdef HAVE_GCRYPT
baed47c3 2596 r = journal_file_hmac_setup(f);
14d10188
LP
2597 if (r < 0)
2598 goto fail;
feb12d3e 2599#endif
14d10188 2600
cec736d2 2601 if (newly_created) {
de190aef 2602 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2603 if (r < 0)
2604 goto fail;
2605
de190aef 2606 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2607 if (r < 0)
2608 goto fail;
7560fffc 2609
feb12d3e 2610#ifdef HAVE_GCRYPT
7560fffc
LP
2611 r = journal_file_append_first_tag(f);
2612 if (r < 0)
2613 goto fail;
feb12d3e 2614#endif
cec736d2
LP
2615 }
2616
de190aef 2617 r = journal_file_map_field_hash_table(f);
cec736d2
LP
2618 if (r < 0)
2619 goto fail;
2620
de190aef 2621 r = journal_file_map_data_hash_table(f);
cec736d2
LP
2622 if (r < 0)
2623 goto fail;
2624
0559d3a5 2625 *ret = f;
cec736d2
LP
2626 return 0;
2627
2628fail:
2629 journal_file_close(f);
2630
2631 return r;
2632}
0ac38b70 2633
baed47c3 2634int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2635 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2636 size_t l;
2637 JournalFile *old_file, *new_file = NULL;
2638 int r;
2639
2640 assert(f);
2641 assert(*f);
2642
2643 old_file = *f;
2644
2645 if (!old_file->writable)
2646 return -EINVAL;
2647
2648 if (!endswith(old_file->path, ".journal"))
2649 return -EINVAL;
2650
2651 l = strlen(old_file->path);
57535f47
ZJS
2652 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2653 (int) l - 8, old_file->path,
2654 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2655 le64toh((*f)->header->head_entry_seqnum),
2656 le64toh((*f)->header->head_entry_realtime));
2657 if (r < 0)
0ac38b70
LP
2658 return -ENOMEM;
2659
0ac38b70 2660 r = rename(old_file->path, p);
0ac38b70
LP
2661 if (r < 0)
2662 return -errno;
2663
ccdbaf91 2664 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2665
baed47c3 2666 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2667 journal_file_close(old_file);
2668
2669 *f = new_file;
2670 return r;
2671}
2672
9447a7f1
LP
2673int journal_file_open_reliably(
2674 const char *fname,
2675 int flags,
2676 mode_t mode,
7560fffc 2677 bool compress,
baed47c3 2678 bool seal,
4a92baf3 2679 JournalMetrics *metrics,
27370278 2680 MMapCache *mmap_cache,
9447a7f1
LP
2681 JournalFile *template,
2682 JournalFile **ret) {
2683
2684 int r;
2685 size_t l;
ed375beb 2686 _cleanup_free_ char *p = NULL;
9447a7f1 2687
baed47c3 2688 r = journal_file_open(fname, flags, mode, compress, seal,
27370278 2689 metrics, mmap_cache, template, ret);
0071d9f1
LP
2690 if (r != -EBADMSG && /* corrupted */
2691 r != -ENODATA && /* truncated */
2692 r != -EHOSTDOWN && /* other machine */
a1a1898f
LP
2693 r != -EPROTONOSUPPORT && /* incompatible feature */
2694 r != -EBUSY && /* unclean shutdown */
2695 r != -ESHUTDOWN /* already archived */)
9447a7f1
LP
2696 return r;
2697
2698 if ((flags & O_ACCMODE) == O_RDONLY)
2699 return r;
2700
2701 if (!(flags & O_CREAT))
2702 return r;
2703
7560fffc
LP
2704 if (!endswith(fname, ".journal"))
2705 return r;
2706
5c70eab4
LP
2707 /* The file is corrupted. Rotate it away and try it again (but only once) */
2708
9447a7f1 2709 l = strlen(fname);
9bf3b535 2710 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
57535f47 2711 (int) l - 8, fname,
9447a7f1 2712 (unsigned long long) now(CLOCK_REALTIME),
9bf3b535 2713 random_u64()) < 0)
9447a7f1
LP
2714 return -ENOMEM;
2715
2716 r = rename(fname, p);
9447a7f1
LP
2717 if (r < 0)
2718 return -errno;
2719
a1a1898f 2720 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2721
baed47c3 2722 return journal_file_open(fname, flags, mode, compress, seal,
27370278 2723 metrics, mmap_cache, template, ret);
9447a7f1
LP
2724}
2725
cf244689
LP
2726int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2727 uint64_t i, n;
2728 uint64_t q, xor_hash = 0;
2729 int r;
2730 EntryItem *items;
2731 dual_timestamp ts;
2732
2733 assert(from);
2734 assert(to);
2735 assert(o);
2736 assert(p);
2737
2738 if (!to->writable)
2739 return -EPERM;
2740
2741 ts.monotonic = le64toh(o->entry.monotonic);
2742 ts.realtime = le64toh(o->entry.realtime);
2743
cf244689 2744 n = journal_file_entry_n_items(o);
4faa7004
TA
2745 /* alloca() can't take 0, hence let's allocate at least one */
2746 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
2747
2748 for (i = 0; i < n; i++) {
4fd052ae
FC
2749 uint64_t l, h;
2750 le64_t le_hash;
cf244689
LP
2751 size_t t;
2752 void *data;
2753 Object *u;
2754
2755 q = le64toh(o->entry.items[i].object_offset);
2756 le_hash = o->entry.items[i].hash;
2757
2758 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2759 if (r < 0)
2760 return r;
2761
2762 if (le_hash != o->data.hash)
2763 return -EBADMSG;
2764
2765 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2766 t = (size_t) l;
2767
2768 /* We hit the limit on 32bit machines */
2769 if ((uint64_t) t != l)
2770 return -E2BIG;
2771
d89c8fdf 2772 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 2773#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51 2774 size_t rsize;
cf244689 2775
d89c8fdf
ZJS
2776 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2777 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2778 if (r < 0)
2779 return r;
cf244689
LP
2780
2781 data = from->compress_buffer;
2782 l = rsize;
3b1a55e1
ZJS
2783#else
2784 return -EPROTONOSUPPORT;
2785#endif
cf244689
LP
2786 } else
2787 data = o->data.payload;
2788
2789 r = journal_file_append_data(to, data, l, &u, &h);
2790 if (r < 0)
2791 return r;
2792
2793 xor_hash ^= le64toh(u->data.hash);
2794 items[i].object_offset = htole64(h);
2795 items[i].hash = u->data.hash;
2796
2797 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2798 if (r < 0)
2799 return r;
2800 }
2801
2802 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2803}
babfc091
LP
2804
2805void journal_default_metrics(JournalMetrics *m, int fd) {
2806 uint64_t fs_size = 0;
2807 struct statvfs ss;
a7bc2c2a 2808 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
babfc091
LP
2809
2810 assert(m);
2811 assert(fd >= 0);
2812
2813 if (fstatvfs(fd, &ss) >= 0)
2814 fs_size = ss.f_frsize * ss.f_blocks;
2815
2816 if (m->max_use == (uint64_t) -1) {
2817
2818 if (fs_size > 0) {
2819 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2820
2821 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2822 m->max_use = DEFAULT_MAX_USE_UPPER;
2823
2824 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2825 m->max_use = DEFAULT_MAX_USE_LOWER;
2826 } else
2827 m->max_use = DEFAULT_MAX_USE_LOWER;
2828 } else {
2829 m->max_use = PAGE_ALIGN(m->max_use);
2830
2831 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2832 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2833 }
2834
2835 if (m->max_size == (uint64_t) -1) {
2836 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2837
2838 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2839 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2840 } else
2841 m->max_size = PAGE_ALIGN(m->max_size);
2842
2843 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2844 m->max_size = JOURNAL_FILE_SIZE_MIN;
2845
2846 if (m->max_size*2 > m->max_use)
2847 m->max_use = m->max_size*2;
2848
2849 if (m->min_size == (uint64_t) -1)
2850 m->min_size = JOURNAL_FILE_SIZE_MIN;
2851 else {
2852 m->min_size = PAGE_ALIGN(m->min_size);
2853
2854 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2855 m->min_size = JOURNAL_FILE_SIZE_MIN;
2856
2857 if (m->min_size > m->max_size)
2858 m->max_size = m->min_size;
2859 }
2860
2861 if (m->keep_free == (uint64_t) -1) {
2862
2863 if (fs_size > 0) {
8621b110 2864 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
2865
2866 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2867 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2868
2869 } else
2870 m->keep_free = DEFAULT_KEEP_FREE;
2871 }
2872
2b43f939
LP
2873 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2874 format_bytes(a, sizeof(a), m->max_use),
2875 format_bytes(b, sizeof(b), m->max_size),
2876 format_bytes(c, sizeof(c), m->min_size),
2877 format_bytes(d, sizeof(d), m->keep_free));
babfc091 2878}
08984293
LP
2879
2880int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
2881 assert(f);
2882 assert(from || to);
2883
2884 if (from) {
162566a4
LP
2885 if (f->header->head_entry_realtime == 0)
2886 return -ENOENT;
08984293 2887
162566a4 2888 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
2889 }
2890
2891 if (to) {
162566a4
LP
2892 if (f->header->tail_entry_realtime == 0)
2893 return -ENOENT;
08984293 2894
162566a4 2895 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
2896 }
2897
2898 return 1;
2899}
2900
2901int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
2902 Object *o;
2903 uint64_t p;
2904 int r;
2905
2906 assert(f);
2907 assert(from || to);
2908
47838ab3 2909 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
2910 if (r <= 0)
2911 return r;
2912
2913 if (le64toh(o->data.n_entries) <= 0)
2914 return 0;
2915
2916 if (from) {
2917 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2918 if (r < 0)
2919 return r;
2920
2921 *from = le64toh(o->entry.monotonic);
2922 }
2923
2924 if (to) {
2925 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2926 if (r < 0)
2927 return r;
2928
2929 r = generic_array_get_plus_one(f,
2930 le64toh(o->data.entry_offset),
2931 le64toh(o->data.entry_array_offset),
2932 le64toh(o->data.n_entries)-1,
2933 &o, NULL);
2934 if (r <= 0)
2935 return r;
2936
2937 *to = le64toh(o->entry.monotonic);
2938 }
2939
2940 return 1;
2941}
dca6219e 2942
fb0951b0 2943bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
2944 assert(f);
2945
2946 /* If we gained new header fields we gained new features,
2947 * hence suggest a rotation */
361f9cbc
LP
2948 if (le64toh(f->header->header_size) < sizeof(Header)) {
2949 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 2950 return true;
361f9cbc 2951 }
dca6219e
LP
2952
2953 /* Let's check if the hash tables grew over a certain fill
2954 * level (75%, borrowing this value from Java's hash table
2955 * implementation), and if so suggest a rotation. To calculate
2956 * the fill level we need the n_data field, which only exists
2957 * in newer versions. */
2958
2959 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 2960 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 2961 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
2962 f->path,
2963 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
2964 le64toh(f->header->n_data),
2965 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2966 (unsigned long long) f->last_stat.st_size,
2967 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 2968 return true;
361f9cbc 2969 }
dca6219e
LP
2970
2971 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 2972 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 2973 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
2974 f->path,
2975 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
2976 le64toh(f->header->n_fields),
2977 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 2978 return true;
361f9cbc 2979 }
dca6219e 2980
0598fd4a
LP
2981 /* Are the data objects properly indexed by field objects? */
2982 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2983 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2984 le64toh(f->header->n_data) > 0 &&
2985 le64toh(f->header->n_fields) == 0)
2986 return true;
2987
fb0951b0
LP
2988 if (max_file_usec > 0) {
2989 usec_t t, h;
2990
2991 h = le64toh(f->header->head_entry_realtime);
2992 t = now(CLOCK_REALTIME);
2993
2994 if (h > 0 && t > h + max_file_usec)
2995 return true;
2996 }
2997
dca6219e
LP
2998 return false;
2999}