]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
journal: delete unused function mmap_cache_close_context
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <sys/mman.h>
23#include <errno.h>
24#include <sys/uio.h>
25#include <unistd.h>
26#include <sys/statvfs.h>
27#include <fcntl.h>
28#include <stddef.h>
d2edfae0 29#include <sys/xattr.h>
fb0951b0 30
cec736d2
LP
31#include "journal-def.h"
32#include "journal-file.h"
0284adc6 33#include "journal-authenticate.h"
cec736d2 34#include "lookup3.h"
807e17f0 35#include "compress.h"
7560fffc 36#include "fsprg.h"
cec736d2 37
4a92baf3
LP
/* Sizes, in bytes, used for the data and field hash tables by default */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Data payloads shorter than this many bytes are stored without
 * attempting compression */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* Smallest size a journal file may have */
#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */

/* Lower and upper bounds applied when the max_use value is deduced
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* Upper bound applied when max_size is deduced from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */

/* Upper bound applied when the keep_free value is deduced from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* keep_free value used when the file system size cannot be
 * determined */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MiB */

/* n_data was the first field added to the header after the initial
 * file format design, hence a valid header extends at least up to it */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* Maximum number of entries kept in the entry array chain cache */
#define CHAIN_CACHE_MAX 20

/* Granularity by which the journal file is grown each time more
 * space has to be allocated */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8 MiB */
70
9588bc32 71static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
72 assert(f);
73
74 if (!f->writable)
75 return -EPERM;
76
77 if (!(f->fd >= 0 && f->header))
78 return -EINVAL;
79
80 switch(f->header->state) {
81 case STATE_ONLINE:
82 return 0;
83
84 case STATE_OFFLINE:
85 f->header->state = STATE_ONLINE;
86 fsync(f->fd);
87 return 0;
88
89 default:
90 return -EINVAL;
91 }
92}
93
94int journal_file_set_offline(JournalFile *f) {
95 assert(f);
96
97 if (!f->writable)
98 return -EPERM;
99
100 if (!(f->fd >= 0 && f->header))
101 return -EINVAL;
102
103 if (f->header->state != STATE_ONLINE)
104 return 0;
105
106 fsync(f->fd);
107
108 f->header->state = STATE_OFFLINE;
109
110 fsync(f->fd);
111
112 return 0;
113}
114
cec736d2 115void journal_file_close(JournalFile *f) {
de190aef 116 assert(f);
cec736d2 117
feb12d3e 118#ifdef HAVE_GCRYPT
b0af6f41 119 /* Write the final tag */
c586dbf1 120 if (f->seal && f->writable)
b0af6f41 121 journal_file_append_tag(f);
feb12d3e 122#endif
b0af6f41 123
7560fffc 124 /* Sync everything to disk, before we mark the file offline */
16e9f408
LP
125 if (f->mmap && f->fd >= 0)
126 mmap_cache_close_fd(f->mmap, f->fd);
7560fffc 127
26687bf8 128 journal_file_set_offline(f);
cec736d2 129
26687bf8 130 if (f->header)
d384c7a8 131 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
cec736d2 132
03e334a1 133 safe_close(f->fd);
cec736d2 134 free(f->path);
807e17f0 135
16e9f408
LP
136 if (f->mmap)
137 mmap_cache_unref(f->mmap);
138
4743015d 139 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 140
d89c8fdf 141#if defined(HAVE_XZ) || defined(HAVE_LZ4)
807e17f0
LP
142 free(f->compress_buffer);
143#endif
144
7560fffc 145#ifdef HAVE_GCRYPT
baed47c3
LP
146 if (f->fss_file)
147 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
b7c9ae91
LP
148 else if (f->fsprg_state)
149 free(f->fsprg_state);
150
151 free(f->fsprg_seed);
7560fffc
LP
152
153 if (f->hmac)
154 gcry_md_close(f->hmac);
155#endif
156
cec736d2
LP
157 free(f);
158}
159
0ac38b70 160static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 161 Header h = {};
cec736d2
LP
162 ssize_t k;
163 int r;
164
165 assert(f);
166
7560fffc 167 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 168 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 169
d89c8fdf
ZJS
170 h.incompatible_flags |= htole32(
171 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
172 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 173
d89c8fdf
ZJS
174 h.compatible_flags = htole32(
175 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 176
cec736d2
LP
177 r = sd_id128_randomize(&h.file_id);
178 if (r < 0)
179 return r;
180
0ac38b70
LP
181 if (template) {
182 h.seqnum_id = template->header->seqnum_id;
beec0085 183 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
184 } else
185 h.seqnum_id = h.file_id;
cec736d2
LP
186
187 k = pwrite(f->fd, &h, sizeof(h), 0);
188 if (k < 0)
189 return -errno;
190
191 if (k != sizeof(h))
192 return -EIO;
193
194 return 0;
195}
196
197static int journal_file_refresh_header(JournalFile *f) {
198 int r;
de190aef 199 sd_id128_t boot_id;
cec736d2
LP
200
201 assert(f);
202
203 r = sd_id128_get_machine(&f->header->machine_id);
204 if (r < 0)
205 return r;
206
de190aef 207 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
208 if (r < 0)
209 return r;
210
de190aef
LP
211 if (sd_id128_equal(boot_id, f->header->boot_id))
212 f->tail_entry_monotonic_valid = true;
213
214 f->header->boot_id = boot_id;
215
26687bf8 216 journal_file_set_online(f);
b788cc23 217
7560fffc 218 /* Sync the online state to disk */
a676e665 219 fsync(f->fd);
b788cc23 220
cec736d2
LP
221 return 0;
222}
223
224static int journal_file_verify_header(JournalFile *f) {
d89c8fdf
ZJS
225 uint32_t flags;
226
cec736d2
LP
227 assert(f);
228
7560fffc 229 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
230 return -EBADMSG;
231
7560fffc
LP
232 /* In both read and write mode we refuse to open files with
233 * incompatible flags we don't know */
d89c8fdf
ZJS
234 flags = le32toh(f->header->incompatible_flags);
235 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
236 if (flags & ~HEADER_INCOMPATIBLE_ANY)
237 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
238 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
239 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
240 if (flags)
241 log_debug("Journal file %s uses incompatible flags %"PRIx32
242 " disabled at compilation time.", f->path, flags);
cec736d2 243 return -EPROTONOSUPPORT;
d89c8fdf 244 }
cec736d2 245
7560fffc
LP
246 /* When open for writing we refuse to open files with
247 * compatible flags, too */
d89c8fdf
ZJS
248 flags = le32toh(f->header->compatible_flags);
249 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
250 if (flags & ~HEADER_COMPATIBLE_ANY)
251 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
252 f->path, flags & ~HEADER_COMPATIBLE_ANY);
253 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
254 if (flags)
255 log_debug("Journal file %s uses compatible flags %"PRIx32
256 " disabled at compilation time.", f->path, flags);
257 return -EPROTONOSUPPORT;
7560fffc
LP
258 }
259
db11ac1a
LP
260 if (f->header->state >= _STATE_MAX)
261 return -EBADMSG;
262
dca6219e
LP
263 /* The first addition was n_data, so check that we are at least this large */
264 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
265 return -EBADMSG;
266
8088cbd3 267 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
268 return -EBADMSG;
269
db11ac1a
LP
270 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
271 return -ENODATA;
272
273 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
274 return -ENODATA;
275
7762e02b
LP
276 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
277 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
278 !VALID64(le64toh(f->header->tail_object_offset)) ||
279 !VALID64(le64toh(f->header->entry_array_offset)))
280 return -ENODATA;
281
cec736d2 282 if (f->writable) {
ccdbaf91 283 uint8_t state;
cec736d2
LP
284 sd_id128_t machine_id;
285 int r;
286
287 r = sd_id128_get_machine(&machine_id);
288 if (r < 0)
289 return r;
290
291 if (!sd_id128_equal(machine_id, f->header->machine_id))
292 return -EHOSTDOWN;
293
de190aef 294 state = f->header->state;
cec736d2 295
71fa6f00
LP
296 if (state == STATE_ONLINE) {
297 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
298 return -EBUSY;
299 } else if (state == STATE_ARCHIVED)
cec736d2 300 return -ESHUTDOWN;
71fa6f00
LP
301 else if (state != STATE_OFFLINE) {
302 log_debug("Journal file %s has unknown state %u.", f->path, state);
303 return -EBUSY;
304 }
cec736d2
LP
305 }
306
d89c8fdf
ZJS
307 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
308 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 309
f1889c91 310 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 311
cec736d2
LP
312 return 0;
313}
314
315static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 316 uint64_t old_size, new_size;
fec2aa2f 317 int r;
cec736d2
LP
318
319 assert(f);
320
cec736d2 321 /* We assume that this file is not sparse, and we know that
38ac38b2 322 * for sure, since we always call posix_fallocate()
cec736d2
LP
323 * ourselves */
324
325 old_size =
23b0b2b2 326 le64toh(f->header->header_size) +
cec736d2
LP
327 le64toh(f->header->arena_size);
328
bc85bfee 329 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
330 if (new_size < le64toh(f->header->header_size))
331 new_size = le64toh(f->header->header_size);
bc85bfee
LP
332
333 if (new_size <= old_size)
cec736d2
LP
334 return 0;
335
a676e665 336 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 337 return -E2BIG;
cec736d2 338
a676e665 339 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
340 struct statvfs svfs;
341
342 if (fstatvfs(f->fd, &svfs) >= 0) {
343 uint64_t available;
344
345 available = svfs.f_bfree * svfs.f_bsize;
346
bc85bfee
LP
347 if (available >= f->metrics.keep_free)
348 available -= f->metrics.keep_free;
cec736d2
LP
349 else
350 available = 0;
351
352 if (new_size - old_size > available)
353 return -E2BIG;
354 }
355 }
356
eda4b58b
LP
357 /* Increase by larger blocks at once */
358 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
359 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
360 new_size = f->metrics.max_size;
361
bc85bfee
LP
362 /* Note that the glibc fallocate() fallback is very
363 inefficient, hence we try to minimize the allocation area
364 as we can. */
fec2aa2f
GV
365 r = posix_fallocate(f->fd, old_size, new_size - old_size);
366 if (r != 0)
367 return -r;
cec736d2 368
eda4b58b
LP
369 if (fstat(f->fd, &f->last_stat) < 0)
370 return -errno;
cec736d2 371
23b0b2b2 372 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2
LP
373
374 return 0;
375}
376
78519831 377static unsigned type_to_context(ObjectType type) {
d3d3208f 378 /* One context for each type, plus one catch-all for the rest */
d05089d8 379 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
380}
381
7a9dabea 382static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
cec736d2 383 assert(f);
cec736d2
LP
384 assert(ret);
385
7762e02b
LP
386 if (size <= 0)
387 return -EINVAL;
388
2a59ea54 389 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
390 if (offset + size > (uint64_t) f->last_stat.st_size) {
391 /* Hmm, out of range? Let's refresh the fstat() data
392 * first, before we trust that check. */
393
394 if (fstat(f->fd, &f->last_stat) < 0 ||
395 offset + size > (uint64_t) f->last_stat.st_size)
396 return -EADDRNOTAVAIL;
397 }
398
7a9dabea 399 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
400}
401
16e9f408
LP
402static uint64_t minimum_header_size(Object *o) {
403
b8e891e6 404 static const uint64_t table[] = {
16e9f408
LP
405 [OBJECT_DATA] = sizeof(DataObject),
406 [OBJECT_FIELD] = sizeof(FieldObject),
407 [OBJECT_ENTRY] = sizeof(EntryObject),
408 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
409 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
410 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
411 [OBJECT_TAG] = sizeof(TagObject),
412 };
413
414 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
415 return sizeof(ObjectHeader);
416
417 return table[o->object.type];
418}
419
78519831 420int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
421 int r;
422 void *t;
423 Object *o;
424 uint64_t s;
425
426 assert(f);
427 assert(ret);
428
db11ac1a
LP
429 /* Objects may only be located at multiple of 64 bit */
430 if (!VALID64(offset))
431 return -EFAULT;
432
7a9dabea 433 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
434 if (r < 0)
435 return r;
436
437 o = (Object*) t;
438 s = le64toh(o->object.size);
439
440 if (s < sizeof(ObjectHeader))
441 return -EBADMSG;
442
16e9f408
LP
443 if (o->object.type <= OBJECT_UNUSED)
444 return -EBADMSG;
445
446 if (s < minimum_header_size(o))
447 return -EBADMSG;
448
d05089d8 449 if (type > OBJECT_UNUSED && o->object.type != type)
cec736d2
LP
450 return -EBADMSG;
451
452 if (s > sizeof(ObjectHeader)) {
7a9dabea 453 r = journal_file_move_to(f, type, false, offset, s, &t);
cec736d2
LP
454 if (r < 0)
455 return r;
456
457 o = (Object*) t;
458 }
459
cec736d2
LP
460 *ret = o;
461 return 0;
462}
463
d98cc1f2 464static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
465 uint64_t r;
466
467 assert(f);
468
beec0085 469 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
470
471 if (seqnum) {
de190aef 472 /* If an external seqnum counter was passed, we update
c2373f84
LP
473 * both the local and the external one, and set it to
474 * the maximum of both */
475
476 if (*seqnum + 1 > r)
477 r = *seqnum + 1;
478
479 *seqnum = r;
480 }
481
beec0085 482 f->header->tail_entry_seqnum = htole64(r);
cec736d2 483
beec0085
LP
484 if (f->header->head_entry_seqnum == 0)
485 f->header->head_entry_seqnum = htole64(r);
de190aef 486
cec736d2
LP
487 return r;
488}
489
78519831 490int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
491 int r;
492 uint64_t p;
493 Object *tail, *o;
494 void *t;
495
496 assert(f);
d05089d8 497 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
498 assert(size >= sizeof(ObjectHeader));
499 assert(offset);
500 assert(ret);
501
26687bf8
OS
502 r = journal_file_set_online(f);
503 if (r < 0)
504 return r;
505
cec736d2 506 p = le64toh(f->header->tail_object_offset);
cec736d2 507 if (p == 0)
23b0b2b2 508 p = le64toh(f->header->header_size);
cec736d2 509 else {
d05089d8 510 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
511 if (r < 0)
512 return r;
513
514 p += ALIGN64(le64toh(tail->object.size));
515 }
516
517 r = journal_file_allocate(f, p, size);
518 if (r < 0)
519 return r;
520
fcde2389 521 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
522 if (r < 0)
523 return r;
524
525 o = (Object*) t;
526
527 zero(o->object);
de190aef 528 o->object.type = type;
cec736d2
LP
529 o->object.size = htole64(size);
530
531 f->header->tail_object_offset = htole64(p);
cec736d2
LP
532 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
533
534 *ret = o;
535 *offset = p;
536
537 return 0;
538}
539
de190aef 540static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
541 uint64_t s, p;
542 Object *o;
543 int r;
544
545 assert(f);
546
dfabe643 547 /* We estimate that we need 1 hash table entry per 768 of
4a92baf3
LP
548 journal file and we want to make sure we never get beyond
549 75% fill level. Calculate the hash table size for the
550 maximum file size based on these metrics. */
551
dfabe643 552 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
553 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
554 s = DEFAULT_DATA_HASH_TABLE_SIZE;
555
507f22bd 556 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 557
de190aef
LP
558 r = journal_file_append_object(f,
559 OBJECT_DATA_HASH_TABLE,
560 offsetof(Object, hash_table.items) + s,
561 &o, &p);
cec736d2
LP
562 if (r < 0)
563 return r;
564
29804cc1 565 memzero(o->hash_table.items, s);
cec736d2 566
de190aef
LP
567 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
568 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
569
570 return 0;
571}
572
de190aef 573static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
574 uint64_t s, p;
575 Object *o;
576 int r;
577
578 assert(f);
579
3c1668da
LP
580 /* We use a fixed size hash table for the fields as this
581 * number should grow very slowly only */
582
de190aef
LP
583 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
584 r = journal_file_append_object(f,
585 OBJECT_FIELD_HASH_TABLE,
586 offsetof(Object, hash_table.items) + s,
587 &o, &p);
cec736d2
LP
588 if (r < 0)
589 return r;
590
29804cc1 591 memzero(o->hash_table.items, s);
cec736d2 592
de190aef
LP
593 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
594 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
595
596 return 0;
597}
598
de190aef 599static int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
600 uint64_t s, p;
601 void *t;
602 int r;
603
604 assert(f);
605
de190aef
LP
606 p = le64toh(f->header->data_hash_table_offset);
607 s = le64toh(f->header->data_hash_table_size);
cec736d2 608
de190aef 609 r = journal_file_move_to(f,
16e9f408 610 OBJECT_DATA_HASH_TABLE,
fcde2389 611 true,
de190aef
LP
612 p, s,
613 &t);
cec736d2
LP
614 if (r < 0)
615 return r;
616
de190aef 617 f->data_hash_table = t;
cec736d2
LP
618 return 0;
619}
620
de190aef 621static int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
622 uint64_t s, p;
623 void *t;
624 int r;
625
626 assert(f);
627
de190aef
LP
628 p = le64toh(f->header->field_hash_table_offset);
629 s = le64toh(f->header->field_hash_table_size);
cec736d2 630
de190aef 631 r = journal_file_move_to(f,
16e9f408 632 OBJECT_FIELD_HASH_TABLE,
fcde2389 633 true,
de190aef
LP
634 p, s,
635 &t);
cec736d2
LP
636 if (r < 0)
637 return r;
638
de190aef 639 f->field_hash_table = t;
cec736d2
LP
640 return 0;
641}
642
3c1668da
LP
643static int journal_file_link_field(
644 JournalFile *f,
645 Object *o,
646 uint64_t offset,
647 uint64_t hash) {
648
649 uint64_t p, h;
650 int r;
651
652 assert(f);
653 assert(o);
654 assert(offset > 0);
655
656 if (o->object.type != OBJECT_FIELD)
657 return -EINVAL;
658
659 /* This might alter the window we are looking at */
660
661 o->field.next_hash_offset = o->field.head_data_offset = 0;
662
663 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
664 p = le64toh(f->field_hash_table[h].tail_hash_offset);
665 if (p == 0)
666 f->field_hash_table[h].head_hash_offset = htole64(offset);
667 else {
668 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
669 if (r < 0)
670 return r;
671
672 o->field.next_hash_offset = htole64(offset);
673 }
674
675 f->field_hash_table[h].tail_hash_offset = htole64(offset);
676
677 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
678 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
679
680 return 0;
681}
682
683static int journal_file_link_data(
684 JournalFile *f,
685 Object *o,
686 uint64_t offset,
687 uint64_t hash) {
688
de190aef 689 uint64_t p, h;
cec736d2
LP
690 int r;
691
692 assert(f);
693 assert(o);
694 assert(offset > 0);
b588975f
LP
695
696 if (o->object.type != OBJECT_DATA)
697 return -EINVAL;
cec736d2 698
48496df6
LP
699 /* This might alter the window we are looking at */
700
de190aef
LP
701 o->data.next_hash_offset = o->data.next_field_offset = 0;
702 o->data.entry_offset = o->data.entry_array_offset = 0;
703 o->data.n_entries = 0;
cec736d2 704
de190aef 705 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
8db4213e 706 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 707 if (p == 0)
cec736d2 708 /* Only entry in the hash table is easy */
de190aef 709 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 710 else {
48496df6
LP
711 /* Move back to the previous data object, to patch in
712 * pointer */
cec736d2 713
de190aef 714 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
715 if (r < 0)
716 return r;
717
de190aef 718 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
719 }
720
de190aef 721 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 722
dca6219e
LP
723 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
724 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
725
cec736d2
LP
726 return 0;
727}
728
3c1668da
LP
729int journal_file_find_field_object_with_hash(
730 JournalFile *f,
731 const void *field, uint64_t size, uint64_t hash,
732 Object **ret, uint64_t *offset) {
733
734 uint64_t p, osize, h;
735 int r;
736
737 assert(f);
738 assert(field && size > 0);
739
740 osize = offsetof(Object, field.payload) + size;
741
742 if (f->header->field_hash_table_size == 0)
743 return -EBADMSG;
744
745 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
746 p = le64toh(f->field_hash_table[h].head_hash_offset);
747
748 while (p > 0) {
749 Object *o;
750
751 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
752 if (r < 0)
753 return r;
754
755 if (le64toh(o->field.hash) == hash &&
756 le64toh(o->object.size) == osize &&
757 memcmp(o->field.payload, field, size) == 0) {
758
759 if (ret)
760 *ret = o;
761 if (offset)
762 *offset = p;
763
764 return 1;
765 }
766
767 p = le64toh(o->field.next_hash_offset);
768 }
769
770 return 0;
771}
772
773int journal_file_find_field_object(
774 JournalFile *f,
775 const void *field, uint64_t size,
776 Object **ret, uint64_t *offset) {
777
778 uint64_t hash;
779
780 assert(f);
781 assert(field && size > 0);
782
783 hash = hash64(field, size);
784
785 return journal_file_find_field_object_with_hash(f,
786 field, size, hash,
787 ret, offset);
788}
789
de190aef
LP
790int journal_file_find_data_object_with_hash(
791 JournalFile *f,
792 const void *data, uint64_t size, uint64_t hash,
793 Object **ret, uint64_t *offset) {
48496df6 794
de190aef 795 uint64_t p, osize, h;
cec736d2
LP
796 int r;
797
798 assert(f);
799 assert(data || size == 0);
800
801 osize = offsetof(Object, data.payload) + size;
802
bc85bfee
LP
803 if (f->header->data_hash_table_size == 0)
804 return -EBADMSG;
805
de190aef
LP
806 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
807 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 808
de190aef
LP
809 while (p > 0) {
810 Object *o;
cec736d2 811
de190aef 812 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
813 if (r < 0)
814 return r;
815
807e17f0 816 if (le64toh(o->data.hash) != hash)
85a131e8 817 goto next;
807e17f0 818
d89c8fdf 819 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 820#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51
ZJS
821 uint64_t l;
822 size_t rsize;
cec736d2 823
807e17f0
LP
824 l = le64toh(o->object.size);
825 if (l <= offsetof(Object, data.payload))
cec736d2
LP
826 return -EBADMSG;
827
807e17f0
LP
828 l -= offsetof(Object, data.payload);
829
d89c8fdf
ZJS
830 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
831 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
832 if (r < 0)
833 return r;
807e17f0 834
b785c858 835 if (rsize == size &&
807e17f0
LP
836 memcmp(f->compress_buffer, data, size) == 0) {
837
838 if (ret)
839 *ret = o;
840
841 if (offset)
842 *offset = p;
843
844 return 1;
845 }
3b1a55e1
ZJS
846#else
847 return -EPROTONOSUPPORT;
848#endif
807e17f0
LP
849 } else if (le64toh(o->object.size) == osize &&
850 memcmp(o->data.payload, data, size) == 0) {
851
cec736d2
LP
852 if (ret)
853 *ret = o;
854
855 if (offset)
856 *offset = p;
857
de190aef 858 return 1;
cec736d2
LP
859 }
860
85a131e8 861 next:
cec736d2
LP
862 p = le64toh(o->data.next_hash_offset);
863 }
864
de190aef
LP
865 return 0;
866}
867
868int journal_file_find_data_object(
869 JournalFile *f,
870 const void *data, uint64_t size,
871 Object **ret, uint64_t *offset) {
872
873 uint64_t hash;
874
875 assert(f);
876 assert(data || size == 0);
877
878 hash = hash64(data, size);
879
880 return journal_file_find_data_object_with_hash(f,
881 data, size, hash,
882 ret, offset);
883}
884
3c1668da
LP
885static int journal_file_append_field(
886 JournalFile *f,
887 const void *field, uint64_t size,
888 Object **ret, uint64_t *offset) {
889
890 uint64_t hash, p;
891 uint64_t osize;
892 Object *o;
893 int r;
894
895 assert(f);
896 assert(field && size > 0);
897
898 hash = hash64(field, size);
899
900 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
901 if (r < 0)
902 return r;
903 else if (r > 0) {
904
905 if (ret)
906 *ret = o;
907
908 if (offset)
909 *offset = p;
910
911 return 0;
912 }
913
914 osize = offsetof(Object, field.payload) + size;
915 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
916 if (r < 0)
917 return r;
3c1668da
LP
918
919 o->field.hash = htole64(hash);
920 memcpy(o->field.payload, field, size);
921
922 r = journal_file_link_field(f, o, p, hash);
923 if (r < 0)
924 return r;
925
926 /* The linking might have altered the window, so let's
927 * refresh our pointer */
928 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
929 if (r < 0)
930 return r;
931
932#ifdef HAVE_GCRYPT
933 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
934 if (r < 0)
935 return r;
936#endif
937
938 if (ret)
939 *ret = o;
940
941 if (offset)
942 *offset = p;
943
944 return 0;
945}
946
48496df6
LP
947static int journal_file_append_data(
948 JournalFile *f,
949 const void *data, uint64_t size,
950 Object **ret, uint64_t *offset) {
951
de190aef
LP
952 uint64_t hash, p;
953 uint64_t osize;
954 Object *o;
d89c8fdf 955 int r, compression = 0;
3c1668da 956 const void *eq;
de190aef
LP
957
958 assert(f);
959 assert(data || size == 0);
960
961 hash = hash64(data, size);
962
963 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
964 if (r < 0)
965 return r;
966 else if (r > 0) {
967
968 if (ret)
969 *ret = o;
970
971 if (offset)
972 *offset = p;
973
974 return 0;
975 }
976
977 osize = offsetof(Object, data.payload) + size;
978 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
979 if (r < 0)
980 return r;
981
cec736d2 982 o->data.hash = htole64(hash);
807e17f0 983
d89c8fdf
ZJS
984#if defined(HAVE_XZ) || defined(HAVE_LZ4)
985 if (f->compress_xz &&
807e17f0 986 size >= COMPRESSION_SIZE_THRESHOLD) {
fa1c4b51 987 size_t rsize;
807e17f0 988
d89c8fdf 989 compression = compress_blob(data, size, o->data.payload, &rsize);
807e17f0 990
d89c8fdf 991 if (compression) {
807e17f0 992 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 993 o->object.flags |= compression;
807e17f0 994
fa1c4b51 995 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 996 size, rsize, object_compressed_to_string(compression));
807e17f0
LP
997 }
998 }
999#endif
1000
d89c8fdf 1001 if (!compression && size > 0)
807e17f0 1002 memcpy(o->data.payload, data, size);
cec736d2 1003
de190aef 1004 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1005 if (r < 0)
1006 return r;
1007
48496df6
LP
1008 /* The linking might have altered the window, so let's
1009 * refresh our pointer */
1010 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1011 if (r < 0)
1012 return r;
1013
08c6f819
SL
1014 if (!data)
1015 eq = NULL;
1016 else
1017 eq = memchr(data, '=', size);
3c1668da 1018 if (eq && eq > data) {
748db592 1019 Object *fo = NULL;
3c1668da 1020 uint64_t fp;
3c1668da
LP
1021
1022 /* Create field object ... */
1023 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1024 if (r < 0)
1025 return r;
1026
1027 /* ... and link it in. */
1028 o->data.next_field_offset = fo->field.head_data_offset;
1029 fo->field.head_data_offset = le64toh(p);
1030 }
1031
5996c7c2
LP
1032#ifdef HAVE_GCRYPT
1033 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1034 if (r < 0)
1035 return r;
1036#endif
1037
cec736d2
LP
1038 if (ret)
1039 *ret = o;
1040
1041 if (offset)
de190aef 1042 *offset = p;
cec736d2
LP
1043
1044 return 0;
1045}
1046
1047uint64_t journal_file_entry_n_items(Object *o) {
1048 assert(o);
b588975f
LP
1049
1050 if (o->object.type != OBJECT_ENTRY)
1051 return 0;
cec736d2
LP
1052
1053 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1054}
1055
0284adc6 1056uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1057 assert(o);
b588975f
LP
1058
1059 if (o->object.type != OBJECT_ENTRY_ARRAY)
1060 return 0;
de190aef
LP
1061
1062 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1063}
1064
fb9a24b6
LP
1065uint64_t journal_file_hash_table_n_items(Object *o) {
1066 assert(o);
b588975f
LP
1067
1068 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1069 o->object.type != OBJECT_FIELD_HASH_TABLE)
1070 return 0;
fb9a24b6
LP
1071
1072 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1073}
1074
de190aef 1075static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1076 le64_t *first,
1077 le64_t *idx,
de190aef 1078 uint64_t p) {
cec736d2 1079 int r;
de190aef
LP
1080 uint64_t n = 0, ap = 0, q, i, a, hidx;
1081 Object *o;
1082
cec736d2 1083 assert(f);
de190aef
LP
1084 assert(first);
1085 assert(idx);
1086 assert(p > 0);
cec736d2 1087
de190aef
LP
1088 a = le64toh(*first);
1089 i = hidx = le64toh(*idx);
1090 while (a > 0) {
1091
1092 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1093 if (r < 0)
1094 return r;
cec736d2 1095
de190aef
LP
1096 n = journal_file_entry_array_n_items(o);
1097 if (i < n) {
1098 o->entry_array.items[i] = htole64(p);
1099 *idx = htole64(hidx + 1);
1100 return 0;
1101 }
cec736d2 1102
de190aef
LP
1103 i -= n;
1104 ap = a;
1105 a = le64toh(o->entry_array.next_entry_array_offset);
1106 }
1107
1108 if (hidx > n)
1109 n = (hidx+1) * 2;
1110 else
1111 n = n * 2;
1112
1113 if (n < 4)
1114 n = 4;
1115
1116 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1117 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1118 &o, &q);
cec736d2
LP
1119 if (r < 0)
1120 return r;
1121
feb12d3e 1122#ifdef HAVE_GCRYPT
5996c7c2 1123 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1124 if (r < 0)
1125 return r;
feb12d3e 1126#endif
b0af6f41 1127
de190aef 1128 o->entry_array.items[i] = htole64(p);
cec736d2 1129
de190aef 1130 if (ap == 0)
7be3aa17 1131 *first = htole64(q);
cec736d2 1132 else {
de190aef 1133 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1134 if (r < 0)
1135 return r;
1136
de190aef
LP
1137 o->entry_array.next_entry_array_offset = htole64(q);
1138 }
cec736d2 1139
2dee23eb
LP
1140 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1141 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1142
de190aef
LP
1143 *idx = htole64(hidx + 1);
1144
1145 return 0;
1146}
cec736d2 1147
de190aef 1148static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1149 le64_t *extra,
1150 le64_t *first,
1151 le64_t *idx,
de190aef
LP
1152 uint64_t p) {
1153
1154 int r;
1155
1156 assert(f);
1157 assert(extra);
1158 assert(first);
1159 assert(idx);
1160 assert(p > 0);
1161
1162 if (*idx == 0)
1163 *extra = htole64(p);
1164 else {
4fd052ae 1165 le64_t i;
de190aef 1166
7be3aa17 1167 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1168 r = link_entry_into_array(f, first, &i, p);
1169 if (r < 0)
1170 return r;
cec736d2
LP
1171 }
1172
de190aef
LP
1173 *idx = htole64(le64toh(*idx) + 1);
1174 return 0;
1175}
1176
1177static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1178 uint64_t p;
1179 int r;
1180 assert(f);
1181 assert(o);
1182 assert(offset > 0);
1183
1184 p = le64toh(o->entry.items[i].object_offset);
1185 if (p == 0)
1186 return -EINVAL;
1187
1188 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1189 if (r < 0)
1190 return r;
1191
de190aef
LP
1192 return link_entry_into_array_plus_one(f,
1193 &o->data.entry_offset,
1194 &o->data.entry_array_offset,
1195 &o->data.n_entries,
1196 offset);
cec736d2
LP
1197}
1198
1199static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1200 uint64_t n, i;
cec736d2
LP
1201 int r;
1202
1203 assert(f);
1204 assert(o);
1205 assert(offset > 0);
b588975f
LP
1206
1207 if (o->object.type != OBJECT_ENTRY)
1208 return -EINVAL;
cec736d2 1209
b788cc23
LP
1210 __sync_synchronize();
1211
cec736d2 1212 /* Link up the entry itself */
de190aef
LP
1213 r = link_entry_into_array(f,
1214 &f->header->entry_array_offset,
1215 &f->header->n_entries,
1216 offset);
1217 if (r < 0)
1218 return r;
cec736d2 1219
507f22bd 1220 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1221
de190aef 1222 if (f->header->head_entry_realtime == 0)
0ac38b70 1223 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1224
0ac38b70 1225 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1226 f->header->tail_entry_monotonic = o->entry.monotonic;
1227
1228 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1229
1230 /* Link up the items */
1231 n = journal_file_entry_n_items(o);
1232 for (i = 0; i < n; i++) {
1233 r = journal_file_link_entry_item(f, o, offset, i);
1234 if (r < 0)
1235 return r;
1236 }
1237
cec736d2
LP
1238 return 0;
1239}
1240
1241static int journal_file_append_entry_internal(
1242 JournalFile *f,
1243 const dual_timestamp *ts,
1244 uint64_t xor_hash,
1245 const EntryItem items[], unsigned n_items,
de190aef 1246 uint64_t *seqnum,
cec736d2
LP
1247 Object **ret, uint64_t *offset) {
1248 uint64_t np;
1249 uint64_t osize;
1250 Object *o;
1251 int r;
1252
1253 assert(f);
1254 assert(items || n_items == 0);
de190aef 1255 assert(ts);
cec736d2
LP
1256
1257 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1258
de190aef 1259 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1260 if (r < 0)
1261 return r;
1262
d98cc1f2 1263 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1264 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1265 o->entry.realtime = htole64(ts->realtime);
1266 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1267 o->entry.xor_hash = htole64(xor_hash);
1268 o->entry.boot_id = f->header->boot_id;
1269
feb12d3e 1270#ifdef HAVE_GCRYPT
5996c7c2 1271 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1272 if (r < 0)
1273 return r;
feb12d3e 1274#endif
b0af6f41 1275
cec736d2
LP
1276 r = journal_file_link_entry(f, o, np);
1277 if (r < 0)
1278 return r;
1279
1280 if (ret)
1281 *ret = o;
1282
1283 if (offset)
1284 *offset = np;
1285
1286 return 0;
1287}
1288
cf244689 1289void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1290 assert(f);
1291
1292 /* inotify() does not receive IN_MODIFY events from file
1293 * accesses done via mmap(). After each access we hence
1294 * trigger IN_MODIFY by truncating the journal file to its
1295 * current size which triggers IN_MODIFY. */
1296
bc85bfee
LP
1297 __sync_synchronize();
1298
50f20cfd 1299 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
56f64d95 1300 log_error_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1301}
1302
1f2da9ec
LP
1303static int entry_item_cmp(const void *_a, const void *_b) {
1304 const EntryItem *a = _a, *b = _b;
1305
1306 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1307 return -1;
1308 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1309 return 1;
1310 return 0;
1311}
1312
de190aef 1313int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1314 unsigned i;
1315 EntryItem *items;
1316 int r;
1317 uint64_t xor_hash = 0;
de190aef 1318 struct dual_timestamp _ts;
cec736d2
LP
1319
1320 assert(f);
1321 assert(iovec || n_iovec == 0);
1322
de190aef
LP
1323 if (!ts) {
1324 dual_timestamp_get(&_ts);
1325 ts = &_ts;
1326 }
1327
1328 if (f->tail_entry_monotonic_valid &&
1329 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1330 return -EINVAL;
1331
feb12d3e 1332#ifdef HAVE_GCRYPT
7560fffc
LP
1333 r = journal_file_maybe_append_tag(f, ts->realtime);
1334 if (r < 0)
1335 return r;
feb12d3e 1336#endif
7560fffc 1337
64825d3c 1338 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1339 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1340
1341 for (i = 0; i < n_iovec; i++) {
1342 uint64_t p;
1343 Object *o;
1344
1345 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1346 if (r < 0)
cf244689 1347 return r;
cec736d2
LP
1348
1349 xor_hash ^= le64toh(o->data.hash);
1350 items[i].object_offset = htole64(p);
de7b95cd 1351 items[i].hash = o->data.hash;
cec736d2
LP
1352 }
1353
1f2da9ec
LP
1354 /* Order by the position on disk, in order to improve seek
1355 * times for rotating media. */
7ff7394d 1356 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1357
de190aef 1358 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1359
50f20cfd
LP
1360 journal_file_post_change(f);
1361
cec736d2
LP
1362 return r;
1363}
1364
/* One cached position within a chain of entry array objects. Items are
 * keyed by the offset of the chain's first array. */
typedef struct ChainCacheItem {
        /* offset of the first array of the chain (the cache key) */
        uint64_t first;
        /* offset of the array that was cached */
        uint64_t array;
        /* first item stored in the cached array */
        uint64_t begin;
        /* number of items in all arrays of the chain preceding the cached one */
        uint64_t total;
        /* index looked at last, used to keep bisection local */
        uint64_t last_index;
} ChainCacheItem;
1372
1373static void chain_cache_put(
4743015d 1374 OrderedHashmap *h,
a4bcff5b
LP
1375 ChainCacheItem *ci,
1376 uint64_t first,
1377 uint64_t array,
1378 uint64_t begin,
f268980d
LP
1379 uint64_t total,
1380 uint64_t last_index) {
a4bcff5b
LP
1381
1382 if (!ci) {
34741aa3
LP
1383 /* If the chain item to cache for this chain is the
1384 * first one it's not worth caching anything */
1385 if (array == first)
1386 return;
1387
29433089 1388 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 1389 ci = ordered_hashmap_steal_first(h);
29433089
LP
1390 assert(ci);
1391 } else {
a4bcff5b
LP
1392 ci = new(ChainCacheItem, 1);
1393 if (!ci)
1394 return;
1395 }
1396
1397 ci->first = first;
1398
4743015d 1399 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
1400 free(ci);
1401 return;
1402 }
1403 } else
1404 assert(ci->first == first);
1405
1406 ci->array = array;
1407 ci->begin = begin;
1408 ci->total = total;
f268980d 1409 ci->last_index = last_index;
a4bcff5b
LP
1410}
1411
f268980d
LP
1412static int generic_array_get(
1413 JournalFile *f,
1414 uint64_t first,
1415 uint64_t i,
1416 Object **ret, uint64_t *offset) {
de190aef 1417
cec736d2 1418 Object *o;
a4bcff5b 1419 uint64_t p = 0, a, t = 0;
cec736d2 1420 int r;
a4bcff5b 1421 ChainCacheItem *ci;
cec736d2
LP
1422
1423 assert(f);
1424
de190aef 1425 a = first;
a4bcff5b
LP
1426
1427 /* Try the chain cache first */
4743015d 1428 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1429 if (ci && i > ci->total) {
1430 a = ci->array;
1431 i -= ci->total;
1432 t = ci->total;
1433 }
1434
de190aef 1435 while (a > 0) {
a4bcff5b 1436 uint64_t k;
cec736d2 1437
de190aef
LP
1438 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1439 if (r < 0)
1440 return r;
cec736d2 1441
a4bcff5b
LP
1442 k = journal_file_entry_array_n_items(o);
1443 if (i < k) {
de190aef 1444 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1445 goto found;
cec736d2
LP
1446 }
1447
a4bcff5b
LP
1448 i -= k;
1449 t += k;
de190aef
LP
1450 a = le64toh(o->entry_array.next_entry_array_offset);
1451 }
1452
a4bcff5b
LP
1453 return 0;
1454
1455found:
1456 /* Let's cache this item for the next invocation */
af13a6b0 1457 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1458
1459 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1460 if (r < 0)
1461 return r;
1462
1463 if (ret)
1464 *ret = o;
1465
1466 if (offset)
1467 *offset = p;
1468
1469 return 1;
1470}
1471
f268980d
LP
1472static int generic_array_get_plus_one(
1473 JournalFile *f,
1474 uint64_t extra,
1475 uint64_t first,
1476 uint64_t i,
1477 Object **ret, uint64_t *offset) {
de190aef
LP
1478
1479 Object *o;
1480
1481 assert(f);
1482
1483 if (i == 0) {
1484 int r;
1485
1486 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1487 if (r < 0)
1488 return r;
1489
de190aef
LP
1490 if (ret)
1491 *ret = o;
cec736d2 1492
de190aef
LP
1493 if (offset)
1494 *offset = extra;
cec736d2 1495
de190aef 1496 return 1;
cec736d2
LP
1497 }
1498
de190aef
LP
1499 return generic_array_get(f, first, i-1, ret, offset);
1500}
cec736d2 1501
de190aef
LP
/* Results of the test_object callbacks used for bisection */
enum {
        TEST_FOUND = 0, /* the tested item matches the needle */
        TEST_LEFT  = 1, /* the tested item is smaller than the needle */
        TEST_RIGHT = 2  /* the tested item is larger than the needle */
};
cec736d2 1507
f268980d
LP
1508static int generic_array_bisect(
1509 JournalFile *f,
1510 uint64_t first,
1511 uint64_t n,
1512 uint64_t needle,
1513 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1514 direction_t direction,
1515 Object **ret,
1516 uint64_t *offset,
1517 uint64_t *idx) {
1518
1519 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1520 bool subtract_one = false;
1521 Object *o, *array = NULL;
1522 int r;
a4bcff5b 1523 ChainCacheItem *ci;
cec736d2 1524
de190aef
LP
1525 assert(f);
1526 assert(test_object);
cec736d2 1527
a4bcff5b 1528 /* Start with the first array in the chain */
de190aef 1529 a = first;
a4bcff5b 1530
4743015d 1531 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1532 if (ci && n > ci->total) {
1533 /* Ah, we have iterated this bisection array chain
1534 * previously! Let's see if we can skip ahead in the
1535 * chain, as far as the last time. But we can't jump
1536 * backwards in the chain, so let's check that
1537 * first. */
1538
1539 r = test_object(f, ci->begin, needle);
1540 if (r < 0)
1541 return r;
1542
1543 if (r == TEST_LEFT) {
f268980d 1544 /* OK, what we are looking for is right of the
a4bcff5b
LP
1545 * begin of this EntryArray, so let's jump
1546 * straight to previously cached array in the
1547 * chain */
1548
1549 a = ci->array;
1550 n -= ci->total;
1551 t = ci->total;
f268980d 1552 last_index = ci->last_index;
a4bcff5b
LP
1553 }
1554 }
1555
de190aef
LP
1556 while (a > 0) {
1557 uint64_t left, right, k, lp;
1558
1559 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1560 if (r < 0)
1561 return r;
1562
de190aef
LP
1563 k = journal_file_entry_array_n_items(array);
1564 right = MIN(k, n);
1565 if (right <= 0)
1566 return 0;
cec736d2 1567
de190aef
LP
1568 i = right - 1;
1569 lp = p = le64toh(array->entry_array.items[i]);
1570 if (p <= 0)
1571 return -EBADMSG;
cec736d2 1572
de190aef
LP
1573 r = test_object(f, p, needle);
1574 if (r < 0)
1575 return r;
cec736d2 1576
de190aef
LP
1577 if (r == TEST_FOUND)
1578 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1579
1580 if (r == TEST_RIGHT) {
1581 left = 0;
1582 right -= 1;
f268980d
LP
1583
1584 if (last_index != (uint64_t) -1) {
1585 assert(last_index <= right);
1586
1587 /* If we cached the last index we
1588 * looked at, let's try to not to jump
1589 * too wildly around and see if we can
1590 * limit the range to look at early to
1591 * the immediate neighbors of the last
1592 * index we looked at. */
1593
1594 if (last_index > 0) {
1595 uint64_t x = last_index - 1;
1596
1597 p = le64toh(array->entry_array.items[x]);
1598 if (p <= 0)
1599 return -EBADMSG;
1600
1601 r = test_object(f, p, needle);
1602 if (r < 0)
1603 return r;
1604
1605 if (r == TEST_FOUND)
1606 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1607
1608 if (r == TEST_RIGHT)
1609 right = x;
1610 else
1611 left = x + 1;
1612 }
1613
1614 if (last_index < right) {
1615 uint64_t y = last_index + 1;
1616
1617 p = le64toh(array->entry_array.items[y]);
1618 if (p <= 0)
1619 return -EBADMSG;
1620
1621 r = test_object(f, p, needle);
1622 if (r < 0)
1623 return r;
1624
1625 if (r == TEST_FOUND)
1626 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1627
1628 if (r == TEST_RIGHT)
1629 right = y;
1630 else
1631 left = y + 1;
1632 }
f268980d
LP
1633 }
1634
de190aef
LP
1635 for (;;) {
1636 if (left == right) {
1637 if (direction == DIRECTION_UP)
1638 subtract_one = true;
1639
1640 i = left;
1641 goto found;
1642 }
1643
1644 assert(left < right);
de190aef 1645 i = (left + right) / 2;
f268980d 1646
de190aef
LP
1647 p = le64toh(array->entry_array.items[i]);
1648 if (p <= 0)
1649 return -EBADMSG;
1650
1651 r = test_object(f, p, needle);
1652 if (r < 0)
1653 return r;
cec736d2 1654
de190aef
LP
1655 if (r == TEST_FOUND)
1656 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1657
1658 if (r == TEST_RIGHT)
1659 right = i;
1660 else
1661 left = i + 1;
1662 }
1663 }
1664
2173cbf8 1665 if (k >= n) {
cbdca852
LP
1666 if (direction == DIRECTION_UP) {
1667 i = n;
1668 subtract_one = true;
1669 goto found;
1670 }
1671
cec736d2 1672 return 0;
cbdca852 1673 }
cec736d2 1674
de190aef
LP
1675 last_p = lp;
1676
1677 n -= k;
1678 t += k;
f268980d 1679 last_index = (uint64_t) -1;
de190aef 1680 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1681 }
1682
1683 return 0;
de190aef
LP
1684
1685found:
1686 if (subtract_one && t == 0 && i == 0)
1687 return 0;
1688
a4bcff5b 1689 /* Let's cache this item for the next invocation */
af13a6b0 1690 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 1691
de190aef
LP
1692 if (subtract_one && i == 0)
1693 p = last_p;
1694 else if (subtract_one)
1695 p = le64toh(array->entry_array.items[i-1]);
1696 else
1697 p = le64toh(array->entry_array.items[i]);
1698
1699 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1700 if (r < 0)
1701 return r;
1702
1703 if (ret)
1704 *ret = o;
1705
1706 if (offset)
1707 *offset = p;
1708
1709 if (idx)
cbdca852 1710 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1711
1712 return 1;
cec736d2
LP
1713}
1714
f268980d
LP
1715
1716static int generic_array_bisect_plus_one(
1717 JournalFile *f,
1718 uint64_t extra,
1719 uint64_t first,
1720 uint64_t n,
1721 uint64_t needle,
1722 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1723 direction_t direction,
1724 Object **ret,
1725 uint64_t *offset,
1726 uint64_t *idx) {
de190aef 1727
cec736d2 1728 int r;
cbdca852
LP
1729 bool step_back = false;
1730 Object *o;
cec736d2
LP
1731
1732 assert(f);
de190aef 1733 assert(test_object);
cec736d2 1734
de190aef
LP
1735 if (n <= 0)
1736 return 0;
cec736d2 1737
de190aef
LP
1738 /* This bisects the array in object 'first', but first checks
1739 * an extra */
de190aef
LP
1740 r = test_object(f, extra, needle);
1741 if (r < 0)
1742 return r;
a536e261
LP
1743
1744 if (r == TEST_FOUND)
1745 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1746
cbdca852
LP
1747 /* if we are looking with DIRECTION_UP then we need to first
1748 see if in the actual array there is a matching entry, and
1749 return the last one of that. But if there isn't any we need
1750 to return this one. Hence remember this, and return it
1751 below. */
1752 if (r == TEST_LEFT)
1753 step_back = direction == DIRECTION_UP;
de190aef 1754
cbdca852
LP
1755 if (r == TEST_RIGHT) {
1756 if (direction == DIRECTION_DOWN)
1757 goto found;
1758 else
1759 return 0;
a536e261 1760 }
cec736d2 1761
de190aef
LP
1762 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1763
cbdca852
LP
1764 if (r == 0 && step_back)
1765 goto found;
1766
ecf68b1d 1767 if (r > 0 && idx)
de190aef
LP
1768 (*idx) ++;
1769
1770 return r;
cbdca852
LP
1771
1772found:
1773 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1774 if (r < 0)
1775 return r;
1776
1777 if (ret)
1778 *ret = o;
1779
1780 if (offset)
1781 *offset = extra;
1782
1783 if (idx)
1784 *idx = 0;
1785
1786 return 1;
1787}
1788
44a6b1b6 1789_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1790 assert(f);
1791 assert(p > 0);
1792
1793 if (p == needle)
1794 return TEST_FOUND;
1795 else if (p < needle)
1796 return TEST_LEFT;
1797 else
1798 return TEST_RIGHT;
1799}
1800
1801int journal_file_move_to_entry_by_offset(
1802 JournalFile *f,
1803 uint64_t p,
1804 direction_t direction,
1805 Object **ret,
1806 uint64_t *offset) {
1807
1808 return generic_array_bisect(f,
1809 le64toh(f->header->entry_array_offset),
1810 le64toh(f->header->n_entries),
1811 p,
1812 test_object_offset,
1813 direction,
1814 ret, offset, NULL);
de190aef
LP
1815}
1816
cbdca852 1817
de190aef
LP
1818static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1819 Object *o;
1820 int r;
1821
1822 assert(f);
1823 assert(p > 0);
1824
1825 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1826 if (r < 0)
1827 return r;
1828
de190aef
LP
1829 if (le64toh(o->entry.seqnum) == needle)
1830 return TEST_FOUND;
1831 else if (le64toh(o->entry.seqnum) < needle)
1832 return TEST_LEFT;
1833 else
1834 return TEST_RIGHT;
1835}
cec736d2 1836
de190aef
LP
1837int journal_file_move_to_entry_by_seqnum(
1838 JournalFile *f,
1839 uint64_t seqnum,
1840 direction_t direction,
1841 Object **ret,
1842 uint64_t *offset) {
1843
1844 return generic_array_bisect(f,
1845 le64toh(f->header->entry_array_offset),
1846 le64toh(f->header->n_entries),
1847 seqnum,
1848 test_object_seqnum,
1849 direction,
1850 ret, offset, NULL);
1851}
cec736d2 1852
de190aef
LP
1853static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1854 Object *o;
1855 int r;
1856
1857 assert(f);
1858 assert(p > 0);
1859
1860 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1861 if (r < 0)
1862 return r;
1863
1864 if (le64toh(o->entry.realtime) == needle)
1865 return TEST_FOUND;
1866 else if (le64toh(o->entry.realtime) < needle)
1867 return TEST_LEFT;
1868 else
1869 return TEST_RIGHT;
cec736d2
LP
1870}
1871
de190aef
LP
1872int journal_file_move_to_entry_by_realtime(
1873 JournalFile *f,
1874 uint64_t realtime,
1875 direction_t direction,
1876 Object **ret,
1877 uint64_t *offset) {
1878
1879 return generic_array_bisect(f,
1880 le64toh(f->header->entry_array_offset),
1881 le64toh(f->header->n_entries),
1882 realtime,
1883 test_object_realtime,
1884 direction,
1885 ret, offset, NULL);
1886}
1887
1888static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1889 Object *o;
1890 int r;
1891
1892 assert(f);
1893 assert(p > 0);
1894
1895 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1896 if (r < 0)
1897 return r;
1898
1899 if (le64toh(o->entry.monotonic) == needle)
1900 return TEST_FOUND;
1901 else if (le64toh(o->entry.monotonic) < needle)
1902 return TEST_LEFT;
1903 else
1904 return TEST_RIGHT;
1905}
1906
47838ab3
ZJS
1907static inline int find_data_object_by_boot_id(
1908 JournalFile *f,
1909 sd_id128_t boot_id,
1910 Object **o,
1911 uint64_t *b) {
1912 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1913
1914 sd_id128_to_string(boot_id, t + 9);
1915 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1916}
1917
de190aef
LP
1918int journal_file_move_to_entry_by_monotonic(
1919 JournalFile *f,
1920 sd_id128_t boot_id,
1921 uint64_t monotonic,
1922 direction_t direction,
1923 Object **ret,
1924 uint64_t *offset) {
1925
de190aef
LP
1926 Object *o;
1927 int r;
1928
cbdca852 1929 assert(f);
de190aef 1930
47838ab3 1931 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
1932 if (r < 0)
1933 return r;
cbdca852 1934 if (r == 0)
de190aef
LP
1935 return -ENOENT;
1936
1937 return generic_array_bisect_plus_one(f,
1938 le64toh(o->data.entry_offset),
1939 le64toh(o->data.entry_array_offset),
1940 le64toh(o->data.n_entries),
1941 monotonic,
1942 test_object_monotonic,
1943 direction,
1944 ret, offset, NULL);
1945}
1946
de190aef
LP
1947int journal_file_next_entry(
1948 JournalFile *f,
1949 Object *o, uint64_t p,
1950 direction_t direction,
1951 Object **ret, uint64_t *offset) {
1952
fb099c8d 1953 uint64_t i, n, ofs;
cec736d2
LP
1954 int r;
1955
1956 assert(f);
de190aef
LP
1957 assert(p > 0 || !o);
1958
1959 n = le64toh(f->header->n_entries);
1960 if (n <= 0)
1961 return 0;
cec736d2
LP
1962
1963 if (!o)
de190aef 1964 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 1965 else {
de190aef 1966 if (o->object.type != OBJECT_ENTRY)
cec736d2
LP
1967 return -EINVAL;
1968
de190aef
LP
1969 r = generic_array_bisect(f,
1970 le64toh(f->header->entry_array_offset),
1971 le64toh(f->header->n_entries),
1972 p,
1973 test_object_offset,
1974 DIRECTION_DOWN,
1975 NULL, NULL,
1976 &i);
1977 if (r <= 0)
1978 return r;
1979
1980 if (direction == DIRECTION_DOWN) {
1981 if (i >= n - 1)
1982 return 0;
1983
1984 i++;
1985 } else {
1986 if (i <= 0)
1987 return 0;
1988
1989 i--;
1990 }
cec736d2
LP
1991 }
1992
de190aef 1993 /* And jump to it */
fb099c8d
ZJS
1994 r = generic_array_get(f,
1995 le64toh(f->header->entry_array_offset),
1996 i,
1997 ret, &ofs);
1998 if (r <= 0)
1999 return r;
2000
2001 if (p > 0 &&
2002 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2003 log_debug("%s: entry array corrupted at entry %"PRIu64,
2004 f->path, i);
2005 return -EBADMSG;
2006 }
2007
2008 if (offset)
2009 *offset = ofs;
2010
2011 return 1;
de190aef 2012}
cec736d2 2013
de190aef
LP
2014int journal_file_skip_entry(
2015 JournalFile *f,
2016 Object *o, uint64_t p,
2017 int64_t skip,
2018 Object **ret, uint64_t *offset) {
2019
2020 uint64_t i, n;
2021 int r;
2022
2023 assert(f);
2024 assert(o);
2025 assert(p > 0);
2026
2027 if (o->object.type != OBJECT_ENTRY)
2028 return -EINVAL;
2029
2030 r = generic_array_bisect(f,
2031 le64toh(f->header->entry_array_offset),
2032 le64toh(f->header->n_entries),
2033 p,
2034 test_object_offset,
2035 DIRECTION_DOWN,
2036 NULL, NULL,
2037 &i);
2038 if (r <= 0)
cec736d2
LP
2039 return r;
2040
de190aef
LP
2041 /* Calculate new index */
2042 if (skip < 0) {
2043 if ((uint64_t) -skip >= i)
2044 i = 0;
2045 else
2046 i = i - (uint64_t) -skip;
2047 } else
2048 i += (uint64_t) skip;
cec736d2 2049
de190aef
LP
2050 n = le64toh(f->header->n_entries);
2051 if (n <= 0)
2052 return -EBADMSG;
cec736d2 2053
de190aef
LP
2054 if (i >= n)
2055 i = n-1;
2056
2057 return generic_array_get(f,
2058 le64toh(f->header->entry_array_offset),
2059 i,
2060 ret, offset);
cec736d2
LP
2061}
2062
de190aef
LP
2063int journal_file_next_entry_for_data(
2064 JournalFile *f,
2065 Object *o, uint64_t p,
2066 uint64_t data_offset,
2067 direction_t direction,
2068 Object **ret, uint64_t *offset) {
2069
2070 uint64_t n, i;
cec736d2 2071 int r;
de190aef 2072 Object *d;
cec736d2
LP
2073
2074 assert(f);
de190aef 2075 assert(p > 0 || !o);
cec736d2 2076
de190aef 2077 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2078 if (r < 0)
de190aef 2079 return r;
cec736d2 2080
de190aef
LP
2081 n = le64toh(d->data.n_entries);
2082 if (n <= 0)
2083 return n;
cec736d2 2084
de190aef
LP
2085 if (!o)
2086 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2087 else {
2088 if (o->object.type != OBJECT_ENTRY)
2089 return -EINVAL;
cec736d2 2090
de190aef
LP
2091 r = generic_array_bisect_plus_one(f,
2092 le64toh(d->data.entry_offset),
2093 le64toh(d->data.entry_array_offset),
2094 le64toh(d->data.n_entries),
2095 p,
2096 test_object_offset,
2097 DIRECTION_DOWN,
2098 NULL, NULL,
2099 &i);
2100
2101 if (r <= 0)
cec736d2
LP
2102 return r;
2103
de190aef
LP
2104 if (direction == DIRECTION_DOWN) {
2105 if (i >= n - 1)
2106 return 0;
cec736d2 2107
de190aef
LP
2108 i++;
2109 } else {
2110 if (i <= 0)
2111 return 0;
cec736d2 2112
de190aef
LP
2113 i--;
2114 }
cec736d2 2115
de190aef 2116 }
cec736d2 2117
de190aef
LP
2118 return generic_array_get_plus_one(f,
2119 le64toh(d->data.entry_offset),
2120 le64toh(d->data.entry_array_offset),
2121 i,
2122 ret, offset);
2123}
cec736d2 2124
cbdca852
LP
2125int journal_file_move_to_entry_by_offset_for_data(
2126 JournalFile *f,
2127 uint64_t data_offset,
2128 uint64_t p,
2129 direction_t direction,
2130 Object **ret, uint64_t *offset) {
2131
2132 int r;
2133 Object *d;
2134
2135 assert(f);
2136
2137 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2138 if (r < 0)
2139 return r;
2140
2141 return generic_array_bisect_plus_one(f,
2142 le64toh(d->data.entry_offset),
2143 le64toh(d->data.entry_array_offset),
2144 le64toh(d->data.n_entries),
2145 p,
2146 test_object_offset,
2147 direction,
2148 ret, offset, NULL);
2149}
2150
2151int journal_file_move_to_entry_by_monotonic_for_data(
2152 JournalFile *f,
2153 uint64_t data_offset,
2154 sd_id128_t boot_id,
2155 uint64_t monotonic,
2156 direction_t direction,
2157 Object **ret, uint64_t *offset) {
2158
cbdca852
LP
2159 Object *o, *d;
2160 int r;
2161 uint64_t b, z;
2162
2163 assert(f);
2164
2165 /* First, seek by time */
47838ab3 2166 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2167 if (r < 0)
2168 return r;
2169 if (r == 0)
2170 return -ENOENT;
2171
2172 r = generic_array_bisect_plus_one(f,
2173 le64toh(o->data.entry_offset),
2174 le64toh(o->data.entry_array_offset),
2175 le64toh(o->data.n_entries),
2176 monotonic,
2177 test_object_monotonic,
2178 direction,
2179 NULL, &z, NULL);
2180 if (r <= 0)
2181 return r;
2182
2183 /* And now, continue seeking until we find an entry that
2184 * exists in both bisection arrays */
2185
2186 for (;;) {
2187 Object *qo;
2188 uint64_t p, q;
2189
2190 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2191 if (r < 0)
2192 return r;
2193
2194 r = generic_array_bisect_plus_one(f,
2195 le64toh(d->data.entry_offset),
2196 le64toh(d->data.entry_array_offset),
2197 le64toh(d->data.n_entries),
2198 z,
2199 test_object_offset,
2200 direction,
2201 NULL, &p, NULL);
2202 if (r <= 0)
2203 return r;
2204
2205 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2206 if (r < 0)
2207 return r;
2208
2209 r = generic_array_bisect_plus_one(f,
2210 le64toh(o->data.entry_offset),
2211 le64toh(o->data.entry_array_offset),
2212 le64toh(o->data.n_entries),
2213 p,
2214 test_object_offset,
2215 direction,
2216 &qo, &q, NULL);
2217
2218 if (r <= 0)
2219 return r;
2220
2221 if (p == q) {
2222 if (ret)
2223 *ret = qo;
2224 if (offset)
2225 *offset = q;
2226
2227 return 1;
2228 }
2229
2230 z = q;
2231 }
cbdca852
LP
2232}
2233
de190aef
LP
2234int journal_file_move_to_entry_by_seqnum_for_data(
2235 JournalFile *f,
2236 uint64_t data_offset,
2237 uint64_t seqnum,
2238 direction_t direction,
2239 Object **ret, uint64_t *offset) {
cec736d2 2240
de190aef
LP
2241 Object *d;
2242 int r;
cec736d2 2243
91a31dde
LP
2244 assert(f);
2245
de190aef 2246 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2247 if (r < 0)
de190aef 2248 return r;
cec736d2 2249
de190aef
LP
2250 return generic_array_bisect_plus_one(f,
2251 le64toh(d->data.entry_offset),
2252 le64toh(d->data.entry_array_offset),
2253 le64toh(d->data.n_entries),
2254 seqnum,
2255 test_object_seqnum,
2256 direction,
2257 ret, offset, NULL);
2258}
cec736d2 2259
de190aef
LP
2260int journal_file_move_to_entry_by_realtime_for_data(
2261 JournalFile *f,
2262 uint64_t data_offset,
2263 uint64_t realtime,
2264 direction_t direction,
2265 Object **ret, uint64_t *offset) {
2266
2267 Object *d;
2268 int r;
2269
91a31dde
LP
2270 assert(f);
2271
de190aef 2272 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2273 if (r < 0)
de190aef
LP
2274 return r;
2275
2276 return generic_array_bisect_plus_one(f,
2277 le64toh(d->data.entry_offset),
2278 le64toh(d->data.entry_array_offset),
2279 le64toh(d->data.n_entries),
2280 realtime,
2281 test_object_realtime,
2282 direction,
2283 ret, offset, NULL);
cec736d2
LP
2284}
2285
0284adc6 2286void journal_file_dump(JournalFile *f) {
7560fffc 2287 Object *o;
7560fffc 2288 int r;
0284adc6 2289 uint64_t p;
7560fffc
LP
2290
2291 assert(f);
2292
0284adc6 2293 journal_file_print_header(f);
7560fffc 2294
0284adc6
LP
2295 p = le64toh(f->header->header_size);
2296 while (p != 0) {
d05089d8 2297 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
2298 if (r < 0)
2299 goto fail;
7560fffc 2300
0284adc6 2301 switch (o->object.type) {
d98cc1f2 2302
0284adc6
LP
2303 case OBJECT_UNUSED:
2304 printf("Type: OBJECT_UNUSED\n");
2305 break;
d98cc1f2 2306
0284adc6
LP
2307 case OBJECT_DATA:
2308 printf("Type: OBJECT_DATA\n");
2309 break;
7560fffc 2310
3c1668da
LP
2311 case OBJECT_FIELD:
2312 printf("Type: OBJECT_FIELD\n");
2313 break;
2314
0284adc6 2315 case OBJECT_ENTRY:
507f22bd
ZJS
2316 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2317 le64toh(o->entry.seqnum),
2318 le64toh(o->entry.monotonic),
2319 le64toh(o->entry.realtime));
0284adc6 2320 break;
7560fffc 2321
0284adc6
LP
2322 case OBJECT_FIELD_HASH_TABLE:
2323 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2324 break;
7560fffc 2325
0284adc6
LP
2326 case OBJECT_DATA_HASH_TABLE:
2327 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2328 break;
7560fffc 2329
0284adc6
LP
2330 case OBJECT_ENTRY_ARRAY:
2331 printf("Type: OBJECT_ENTRY_ARRAY\n");
2332 break;
7560fffc 2333
0284adc6 2334 case OBJECT_TAG:
507f22bd
ZJS
2335 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2336 le64toh(o->tag.seqnum),
2337 le64toh(o->tag.epoch));
0284adc6 2338 break;
3c1668da
LP
2339
2340 default:
2341 printf("Type: unknown (%u)\n", o->object.type);
2342 break;
0284adc6 2343 }
7560fffc 2344
d89c8fdf
ZJS
2345 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2346 printf("Flags: %s\n",
2347 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 2348
0284adc6
LP
2349 if (p == le64toh(f->header->tail_object_offset))
2350 p = 0;
2351 else
2352 p = p + ALIGN64(le64toh(o->object.size));
2353 }
7560fffc 2354
0284adc6
LP
2355 return;
2356fail:
2357 log_error("File corrupt");
7560fffc
LP
2358}
2359
718fe4b1
ZJS
2360static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2361 const char *x;
2362
2363 x = format_timestamp(buf, l, t);
2364 if (x)
2365 return x;
2366 return " --- ";
2367}
2368
0284adc6 2369void journal_file_print_header(JournalFile *f) {
2765b7bb 2370 char a[33], b[33], c[33], d[33];
ed375beb 2371 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2372 struct stat st;
2373 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2374
2375 assert(f);
7560fffc 2376
0284adc6
LP
2377 printf("File Path: %s\n"
2378 "File ID: %s\n"
2379 "Machine ID: %s\n"
2380 "Boot ID: %s\n"
2381 "Sequential Number ID: %s\n"
2382 "State: %s\n"
2383 "Compatible Flags:%s%s\n"
d89c8fdf 2384 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
2385 "Header size: %"PRIu64"\n"
2386 "Arena size: %"PRIu64"\n"
2387 "Data Hash Table Size: %"PRIu64"\n"
2388 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2389 "Rotate Suggested: %s\n"
507f22bd
ZJS
2390 "Head Sequential Number: %"PRIu64"\n"
2391 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2392 "Head Realtime Timestamp: %s\n"
3223f44f 2393 "Tail Realtime Timestamp: %s\n"
ed375beb 2394 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2395 "Objects: %"PRIu64"\n"
2396 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2397 f->path,
2398 sd_id128_to_string(f->header->file_id, a),
2399 sd_id128_to_string(f->header->machine_id, b),
2400 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2401 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2402 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2403 f->header->state == STATE_ONLINE ? "ONLINE" :
2404 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 2405 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
2406 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2407 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2408 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2409 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
2410 le64toh(f->header->header_size),
2411 le64toh(f->header->arena_size),
2412 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2413 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2414 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2415 le64toh(f->header->head_entry_seqnum),
2416 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2417 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2418 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2419 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2420 le64toh(f->header->n_objects),
2421 le64toh(f->header->n_entries));
7560fffc 2422
0284adc6 2423 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2424 printf("Data Objects: %"PRIu64"\n"
0284adc6 2425 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2426 le64toh(f->header->n_data),
0284adc6 2427 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2428
0284adc6 2429 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2430 printf("Field Objects: %"PRIu64"\n"
0284adc6 2431 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2432 le64toh(f->header->n_fields),
0284adc6 2433 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2434
2435 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2436 printf("Tag Objects: %"PRIu64"\n",
2437 le64toh(f->header->n_tags));
3223f44f 2438 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2439 printf("Entry Array Objects: %"PRIu64"\n",
2440 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2441
2442 if (fstat(f->fd, &st) >= 0)
2443 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
7560fffc
LP
2444}
2445
0284adc6
LP
2446int journal_file_open(
2447 const char *fname,
2448 int flags,
2449 mode_t mode,
2450 bool compress,
baed47c3 2451 bool seal,
0284adc6
LP
2452 JournalMetrics *metrics,
2453 MMapCache *mmap_cache,
2454 JournalFile *template,
2455 JournalFile **ret) {
7560fffc 2456
0284adc6
LP
2457 JournalFile *f;
2458 int r;
2459 bool newly_created = false;
7560fffc 2460
0284adc6 2461 assert(fname);
0559d3a5 2462 assert(ret);
7560fffc 2463
0284adc6
LP
2464 if ((flags & O_ACCMODE) != O_RDONLY &&
2465 (flags & O_ACCMODE) != O_RDWR)
2466 return -EINVAL;
7560fffc 2467
a0108012
LP
2468 if (!endswith(fname, ".journal") &&
2469 !endswith(fname, ".journal~"))
0284adc6 2470 return -EINVAL;
7560fffc 2471
0284adc6
LP
2472 f = new0(JournalFile, 1);
2473 if (!f)
2474 return -ENOMEM;
7560fffc 2475
0284adc6
LP
2476 f->fd = -1;
2477 f->mode = mode;
7560fffc 2478
0284adc6
LP
2479 f->flags = flags;
2480 f->prot = prot_from_flags(flags);
2481 f->writable = (flags & O_ACCMODE) != O_RDONLY;
92261977 2482#if defined(HAVE_LZ4)
d89c8fdf
ZJS
2483 f->compress_lz4 = compress;
2484#elif defined(HAVE_XZ)
2485 f->compress_xz = compress;
48b61739 2486#endif
49a32d43 2487#ifdef HAVE_GCRYPT
baed47c3 2488 f->seal = seal;
49a32d43 2489#endif
7560fffc 2490
0284adc6
LP
2491 if (mmap_cache)
2492 f->mmap = mmap_cache_ref(mmap_cache);
2493 else {
84168d80 2494 f->mmap = mmap_cache_new();
0284adc6
LP
2495 if (!f->mmap) {
2496 r = -ENOMEM;
2497 goto fail;
2498 }
2499 }
7560fffc 2500
0284adc6
LP
2501 f->path = strdup(fname);
2502 if (!f->path) {
2503 r = -ENOMEM;
2504 goto fail;
2505 }
7560fffc 2506
4743015d 2507 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
2508 if (!f->chain_cache) {
2509 r = -ENOMEM;
2510 goto fail;
2511 }
2512
0284adc6
LP
2513 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2514 if (f->fd < 0) {
2515 r = -errno;
2516 goto fail;
7560fffc 2517 }
7560fffc 2518
0284adc6
LP
2519 if (fstat(f->fd, &f->last_stat) < 0) {
2520 r = -errno;
2521 goto fail;
2522 }
7560fffc 2523
0284adc6 2524 if (f->last_stat.st_size == 0 && f->writable) {
fb0951b0
LP
2525 uint64_t crtime;
2526
2527 /* Let's attach the creation time to the journal file,
2528 * so that the vacuuming code knows the age of this
2529 * file even if the file might end up corrupted one
2530 * day... Ideally we'd just use the creation time many
2531 * file systems maintain for each file, but there is
2532 * currently no usable API to query this, hence let's
2533 * emulate this via extended attributes. If extended
2534 * attributes are not supported we'll just skip this,
7517e174 2535 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0
LP
2536
2537 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2538 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
7560fffc 2539
feb12d3e 2540#ifdef HAVE_GCRYPT
0284adc6 2541 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2542 * just don't do sealing */
49a32d43
LP
2543 if (f->seal) {
2544 r = journal_file_fss_load(f);
2545 if (r < 0)
2546 f->seal = false;
2547 }
feb12d3e 2548#endif
7560fffc 2549
0284adc6
LP
2550 r = journal_file_init_header(f, template);
2551 if (r < 0)
2552 goto fail;
7560fffc 2553
0284adc6
LP
2554 if (fstat(f->fd, &f->last_stat) < 0) {
2555 r = -errno;
2556 goto fail;
2557 }
fb0951b0
LP
2558
2559 newly_created = true;
0284adc6 2560 }
7560fffc 2561
0284adc6
LP
2562 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2563 r = -EIO;
2564 goto fail;
2565 }
7560fffc 2566
0284adc6
LP
2567 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2568 if (f->header == MAP_FAILED) {
2569 f->header = NULL;
2570 r = -errno;
2571 goto fail;
2572 }
7560fffc 2573
0284adc6
LP
2574 if (!newly_created) {
2575 r = journal_file_verify_header(f);
2576 if (r < 0)
2577 goto fail;
2578 }
7560fffc 2579
feb12d3e 2580#ifdef HAVE_GCRYPT
0284adc6 2581 if (!newly_created && f->writable) {
baed47c3 2582 r = journal_file_fss_load(f);
0284adc6
LP
2583 if (r < 0)
2584 goto fail;
2585 }
feb12d3e 2586#endif
cec736d2
LP
2587
2588 if (f->writable) {
4a92baf3
LP
2589 if (metrics) {
2590 journal_default_metrics(metrics, f->fd);
2591 f->metrics = *metrics;
2592 } else if (template)
2593 f->metrics = template->metrics;
2594
cec736d2
LP
2595 r = journal_file_refresh_header(f);
2596 if (r < 0)
2597 goto fail;
2598 }
2599
feb12d3e 2600#ifdef HAVE_GCRYPT
baed47c3 2601 r = journal_file_hmac_setup(f);
14d10188
LP
2602 if (r < 0)
2603 goto fail;
feb12d3e 2604#endif
14d10188 2605
cec736d2 2606 if (newly_created) {
de190aef 2607 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2608 if (r < 0)
2609 goto fail;
2610
de190aef 2611 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2612 if (r < 0)
2613 goto fail;
7560fffc 2614
feb12d3e 2615#ifdef HAVE_GCRYPT
7560fffc
LP
2616 r = journal_file_append_first_tag(f);
2617 if (r < 0)
2618 goto fail;
feb12d3e 2619#endif
cec736d2
LP
2620 }
2621
de190aef 2622 r = journal_file_map_field_hash_table(f);
cec736d2
LP
2623 if (r < 0)
2624 goto fail;
2625
de190aef 2626 r = journal_file_map_data_hash_table(f);
cec736d2
LP
2627 if (r < 0)
2628 goto fail;
2629
0559d3a5 2630 *ret = f;
cec736d2
LP
2631 return 0;
2632
2633fail:
2634 journal_file_close(f);
2635
2636 return r;
2637}
0ac38b70 2638
baed47c3 2639int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2640 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2641 size_t l;
2642 JournalFile *old_file, *new_file = NULL;
2643 int r;
2644
2645 assert(f);
2646 assert(*f);
2647
2648 old_file = *f;
2649
2650 if (!old_file->writable)
2651 return -EINVAL;
2652
2653 if (!endswith(old_file->path, ".journal"))
2654 return -EINVAL;
2655
2656 l = strlen(old_file->path);
57535f47
ZJS
2657 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2658 (int) l - 8, old_file->path,
2659 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2660 le64toh((*f)->header->head_entry_seqnum),
2661 le64toh((*f)->header->head_entry_realtime));
2662 if (r < 0)
0ac38b70
LP
2663 return -ENOMEM;
2664
0ac38b70 2665 r = rename(old_file->path, p);
0ac38b70
LP
2666 if (r < 0)
2667 return -errno;
2668
ccdbaf91 2669 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2670
baed47c3 2671 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2672 journal_file_close(old_file);
2673
2674 *f = new_file;
2675 return r;
2676}
2677
9447a7f1
LP
2678int journal_file_open_reliably(
2679 const char *fname,
2680 int flags,
2681 mode_t mode,
7560fffc 2682 bool compress,
baed47c3 2683 bool seal,
4a92baf3 2684 JournalMetrics *metrics,
27370278 2685 MMapCache *mmap_cache,
9447a7f1
LP
2686 JournalFile *template,
2687 JournalFile **ret) {
2688
2689 int r;
2690 size_t l;
ed375beb 2691 _cleanup_free_ char *p = NULL;
9447a7f1 2692
baed47c3 2693 r = journal_file_open(fname, flags, mode, compress, seal,
27370278 2694 metrics, mmap_cache, template, ret);
0071d9f1
LP
2695 if (r != -EBADMSG && /* corrupted */
2696 r != -ENODATA && /* truncated */
2697 r != -EHOSTDOWN && /* other machine */
a1a1898f
LP
2698 r != -EPROTONOSUPPORT && /* incompatible feature */
2699 r != -EBUSY && /* unclean shutdown */
2700 r != -ESHUTDOWN /* already archived */)
9447a7f1
LP
2701 return r;
2702
2703 if ((flags & O_ACCMODE) == O_RDONLY)
2704 return r;
2705
2706 if (!(flags & O_CREAT))
2707 return r;
2708
7560fffc
LP
2709 if (!endswith(fname, ".journal"))
2710 return r;
2711
5c70eab4
LP
2712 /* The file is corrupted. Rotate it away and try it again (but only once) */
2713
9447a7f1 2714 l = strlen(fname);
9bf3b535 2715 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
57535f47 2716 (int) l - 8, fname,
9447a7f1 2717 (unsigned long long) now(CLOCK_REALTIME),
9bf3b535 2718 random_u64()) < 0)
9447a7f1
LP
2719 return -ENOMEM;
2720
2721 r = rename(fname, p);
9447a7f1
LP
2722 if (r < 0)
2723 return -errno;
2724
a1a1898f 2725 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2726
baed47c3 2727 return journal_file_open(fname, flags, mode, compress, seal,
27370278 2728 metrics, mmap_cache, template, ret);
9447a7f1
LP
2729}
2730
cf244689
LP
2731int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2732 uint64_t i, n;
2733 uint64_t q, xor_hash = 0;
2734 int r;
2735 EntryItem *items;
2736 dual_timestamp ts;
2737
2738 assert(from);
2739 assert(to);
2740 assert(o);
2741 assert(p);
2742
2743 if (!to->writable)
2744 return -EPERM;
2745
2746 ts.monotonic = le64toh(o->entry.monotonic);
2747 ts.realtime = le64toh(o->entry.realtime);
2748
cf244689 2749 n = journal_file_entry_n_items(o);
4faa7004
TA
2750 /* alloca() can't take 0, hence let's allocate at least one */
2751 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
2752
2753 for (i = 0; i < n; i++) {
4fd052ae
FC
2754 uint64_t l, h;
2755 le64_t le_hash;
cf244689
LP
2756 size_t t;
2757 void *data;
2758 Object *u;
2759
2760 q = le64toh(o->entry.items[i].object_offset);
2761 le_hash = o->entry.items[i].hash;
2762
2763 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2764 if (r < 0)
2765 return r;
2766
2767 if (le_hash != o->data.hash)
2768 return -EBADMSG;
2769
2770 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2771 t = (size_t) l;
2772
2773 /* We hit the limit on 32bit machines */
2774 if ((uint64_t) t != l)
2775 return -E2BIG;
2776
d89c8fdf 2777 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 2778#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51 2779 size_t rsize;
cf244689 2780
d89c8fdf
ZJS
2781 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2782 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2783 if (r < 0)
2784 return r;
cf244689
LP
2785
2786 data = from->compress_buffer;
2787 l = rsize;
3b1a55e1
ZJS
2788#else
2789 return -EPROTONOSUPPORT;
2790#endif
cf244689
LP
2791 } else
2792 data = o->data.payload;
2793
2794 r = journal_file_append_data(to, data, l, &u, &h);
2795 if (r < 0)
2796 return r;
2797
2798 xor_hash ^= le64toh(u->data.hash);
2799 items[i].object_offset = htole64(h);
2800 items[i].hash = u->data.hash;
2801
2802 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2803 if (r < 0)
2804 return r;
2805 }
2806
2807 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2808}
babfc091
LP
2809
2810void journal_default_metrics(JournalMetrics *m, int fd) {
2811 uint64_t fs_size = 0;
2812 struct statvfs ss;
a7bc2c2a 2813 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
babfc091
LP
2814
2815 assert(m);
2816 assert(fd >= 0);
2817
2818 if (fstatvfs(fd, &ss) >= 0)
2819 fs_size = ss.f_frsize * ss.f_blocks;
2820
2821 if (m->max_use == (uint64_t) -1) {
2822
2823 if (fs_size > 0) {
2824 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2825
2826 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2827 m->max_use = DEFAULT_MAX_USE_UPPER;
2828
2829 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2830 m->max_use = DEFAULT_MAX_USE_LOWER;
2831 } else
2832 m->max_use = DEFAULT_MAX_USE_LOWER;
2833 } else {
2834 m->max_use = PAGE_ALIGN(m->max_use);
2835
2836 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2837 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2838 }
2839
2840 if (m->max_size == (uint64_t) -1) {
2841 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2842
2843 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2844 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2845 } else
2846 m->max_size = PAGE_ALIGN(m->max_size);
2847
2848 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2849 m->max_size = JOURNAL_FILE_SIZE_MIN;
2850
2851 if (m->max_size*2 > m->max_use)
2852 m->max_use = m->max_size*2;
2853
2854 if (m->min_size == (uint64_t) -1)
2855 m->min_size = JOURNAL_FILE_SIZE_MIN;
2856 else {
2857 m->min_size = PAGE_ALIGN(m->min_size);
2858
2859 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2860 m->min_size = JOURNAL_FILE_SIZE_MIN;
2861
2862 if (m->min_size > m->max_size)
2863 m->max_size = m->min_size;
2864 }
2865
2866 if (m->keep_free == (uint64_t) -1) {
2867
2868 if (fs_size > 0) {
8621b110 2869 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
2870
2871 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2872 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2873
2874 } else
2875 m->keep_free = DEFAULT_KEEP_FREE;
2876 }
2877
2b43f939
LP
2878 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2879 format_bytes(a, sizeof(a), m->max_use),
2880 format_bytes(b, sizeof(b), m->max_size),
2881 format_bytes(c, sizeof(c), m->min_size),
2882 format_bytes(d, sizeof(d), m->keep_free));
babfc091 2883}
08984293
LP
2884
2885int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
2886 assert(f);
2887 assert(from || to);
2888
2889 if (from) {
162566a4
LP
2890 if (f->header->head_entry_realtime == 0)
2891 return -ENOENT;
08984293 2892
162566a4 2893 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
2894 }
2895
2896 if (to) {
162566a4
LP
2897 if (f->header->tail_entry_realtime == 0)
2898 return -ENOENT;
08984293 2899
162566a4 2900 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
2901 }
2902
2903 return 1;
2904}
2905
2906int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
2907 Object *o;
2908 uint64_t p;
2909 int r;
2910
2911 assert(f);
2912 assert(from || to);
2913
47838ab3 2914 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
2915 if (r <= 0)
2916 return r;
2917
2918 if (le64toh(o->data.n_entries) <= 0)
2919 return 0;
2920
2921 if (from) {
2922 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2923 if (r < 0)
2924 return r;
2925
2926 *from = le64toh(o->entry.monotonic);
2927 }
2928
2929 if (to) {
2930 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2931 if (r < 0)
2932 return r;
2933
2934 r = generic_array_get_plus_one(f,
2935 le64toh(o->data.entry_offset),
2936 le64toh(o->data.entry_array_offset),
2937 le64toh(o->data.n_entries)-1,
2938 &o, NULL);
2939 if (r <= 0)
2940 return r;
2941
2942 *to = le64toh(o->entry.monotonic);
2943 }
2944
2945 return 1;
2946}
dca6219e 2947
fb0951b0 2948bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
2949 assert(f);
2950
2951 /* If we gained new header fields we gained new features,
2952 * hence suggest a rotation */
361f9cbc
LP
2953 if (le64toh(f->header->header_size) < sizeof(Header)) {
2954 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 2955 return true;
361f9cbc 2956 }
dca6219e
LP
2957
2958 /* Let's check if the hash tables grew over a certain fill
2959 * level (75%, borrowing this value from Java's hash table
2960 * implementation), and if so suggest a rotation. To calculate
2961 * the fill level we need the n_data field, which only exists
2962 * in newer versions. */
2963
2964 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 2965 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 2966 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
2967 f->path,
2968 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
2969 le64toh(f->header->n_data),
2970 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2971 (unsigned long long) f->last_stat.st_size,
2972 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 2973 return true;
361f9cbc 2974 }
dca6219e
LP
2975
2976 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 2977 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 2978 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
2979 f->path,
2980 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
2981 le64toh(f->header->n_fields),
2982 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 2983 return true;
361f9cbc 2984 }
dca6219e 2985
0598fd4a
LP
2986 /* Are the data objects properly indexed by field objects? */
2987 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2988 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2989 le64toh(f->header->n_data) > 0 &&
2990 le64toh(f->header->n_fields) == 0)
2991 return true;
2992
fb0951b0
LP
2993 if (max_file_usec > 0) {
2994 usec_t t, h;
2995
2996 h = le64toh(f->header->head_entry_realtime);
2997 t = now(CLOCK_REALTIME);
2998
2999 if (h > 0 && t > h + max_file_usec)
3000 return true;
3001 }
3002
dca6219e
LP
3003 return false;
3004}