]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
journalctl: static variables immediately configured via command line arguments should...
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <sys/mman.h>
23#include <errno.h>
24#include <sys/uio.h>
25#include <unistd.h>
26#include <sys/statvfs.h>
27#include <fcntl.h>
28#include <stddef.h>
fb0951b0 29
cec736d2
LP
30#include "journal-def.h"
31#include "journal-file.h"
0284adc6 32#include "journal-authenticate.h"
cec736d2 33#include "lookup3.h"
807e17f0 34#include "compress.h"
7560fffc 35#include "fsprg.h"
cec736d2 36
4a92baf3
LP
/* Default sizes, in bytes, of the data and field hash tables */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Data payloads of at least this many bytes are candidates for compression */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (4ULL << 20) /* 4 MiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL << 20) /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL << 30) /* 4 GiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL << 20) /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL << 30) /* 4 GiB */

/* This is the keep_free value when we can't determine the file system
 * size */
#define DEFAULT_KEEP_FREE (1ULL << 20) /* 1 MiB */

/* n_data was the first entry we added after the initial file format design */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8ULL << 20) /* 8 MiB */

/* The mmap context to use for the header: we pick one above the last defined object type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX
72
9588bc32 73static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
74 assert(f);
75
76 if (!f->writable)
77 return -EPERM;
78
79 if (!(f->fd >= 0 && f->header))
80 return -EINVAL;
81
fa6ac760
LP
82 if (mmap_cache_got_sigbus(f->mmap, f->fd))
83 return -EIO;
84
26687bf8
OS
85 switch(f->header->state) {
86 case STATE_ONLINE:
87 return 0;
88
89 case STATE_OFFLINE:
90 f->header->state = STATE_ONLINE;
91 fsync(f->fd);
92 return 0;
93
94 default:
95 return -EINVAL;
96 }
97}
98
99int journal_file_set_offline(JournalFile *f) {
100 assert(f);
101
102 if (!f->writable)
103 return -EPERM;
104
105 if (!(f->fd >= 0 && f->header))
106 return -EINVAL;
107
108 if (f->header->state != STATE_ONLINE)
109 return 0;
110
111 fsync(f->fd);
112
fa6ac760
LP
113 if (mmap_cache_got_sigbus(f->mmap, f->fd))
114 return -EIO;
115
26687bf8
OS
116 f->header->state = STATE_OFFLINE;
117
fa6ac760
LP
118 if (mmap_cache_got_sigbus(f->mmap, f->fd))
119 return -EIO;
120
26687bf8
OS
121 fsync(f->fd);
122
123 return 0;
124}
125
cec736d2 126void journal_file_close(JournalFile *f) {
de190aef 127 assert(f);
cec736d2 128
feb12d3e 129#ifdef HAVE_GCRYPT
b0af6f41 130 /* Write the final tag */
c586dbf1 131 if (f->seal && f->writable)
b0af6f41 132 journal_file_append_tag(f);
feb12d3e 133#endif
b0af6f41 134
26687bf8 135 journal_file_set_offline(f);
cec736d2 136
fa6ac760
LP
137 if (f->mmap && f->fd >= 0)
138 mmap_cache_close_fd(f->mmap, f->fd);
cec736d2 139
03e334a1 140 safe_close(f->fd);
cec736d2 141 free(f->path);
807e17f0 142
16e9f408
LP
143 if (f->mmap)
144 mmap_cache_unref(f->mmap);
145
4743015d 146 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 147
d89c8fdf 148#if defined(HAVE_XZ) || defined(HAVE_LZ4)
807e17f0
LP
149 free(f->compress_buffer);
150#endif
151
7560fffc 152#ifdef HAVE_GCRYPT
baed47c3
LP
153 if (f->fss_file)
154 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
b7c9ae91
LP
155 else if (f->fsprg_state)
156 free(f->fsprg_state);
157
158 free(f->fsprg_seed);
7560fffc
LP
159
160 if (f->hmac)
161 gcry_md_close(f->hmac);
162#endif
163
cec736d2
LP
164 free(f);
165}
166
0ac38b70 167static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 168 Header h = {};
cec736d2
LP
169 ssize_t k;
170 int r;
171
172 assert(f);
173
7560fffc 174 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 175 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 176
d89c8fdf
ZJS
177 h.incompatible_flags |= htole32(
178 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
179 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 180
d89c8fdf
ZJS
181 h.compatible_flags = htole32(
182 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 183
cec736d2
LP
184 r = sd_id128_randomize(&h.file_id);
185 if (r < 0)
186 return r;
187
0ac38b70
LP
188 if (template) {
189 h.seqnum_id = template->header->seqnum_id;
beec0085 190 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
191 } else
192 h.seqnum_id = h.file_id;
cec736d2
LP
193
194 k = pwrite(f->fd, &h, sizeof(h), 0);
195 if (k < 0)
196 return -errno;
197
198 if (k != sizeof(h))
199 return -EIO;
200
201 return 0;
202}
203
204static int journal_file_refresh_header(JournalFile *f) {
de190aef 205 sd_id128_t boot_id;
fa6ac760 206 int r;
cec736d2
LP
207
208 assert(f);
209
210 r = sd_id128_get_machine(&f->header->machine_id);
211 if (r < 0)
212 return r;
213
de190aef 214 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
215 if (r < 0)
216 return r;
217
de190aef
LP
218 if (sd_id128_equal(boot_id, f->header->boot_id))
219 f->tail_entry_monotonic_valid = true;
220
221 f->header->boot_id = boot_id;
222
fa6ac760 223 r = journal_file_set_online(f);
b788cc23 224
7560fffc 225 /* Sync the online state to disk */
a676e665 226 fsync(f->fd);
b788cc23 227
fa6ac760 228 return r;
cec736d2
LP
229}
230
231static int journal_file_verify_header(JournalFile *f) {
d89c8fdf
ZJS
232 uint32_t flags;
233
cec736d2
LP
234 assert(f);
235
7560fffc 236 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
237 return -EBADMSG;
238
7560fffc
LP
239 /* In both read and write mode we refuse to open files with
240 * incompatible flags we don't know */
d89c8fdf
ZJS
241 flags = le32toh(f->header->incompatible_flags);
242 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
243 if (flags & ~HEADER_INCOMPATIBLE_ANY)
244 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
245 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
246 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
247 if (flags)
248 log_debug("Journal file %s uses incompatible flags %"PRIx32
249 " disabled at compilation time.", f->path, flags);
cec736d2 250 return -EPROTONOSUPPORT;
d89c8fdf 251 }
cec736d2 252
7560fffc
LP
253 /* When open for writing we refuse to open files with
254 * compatible flags, too */
d89c8fdf
ZJS
255 flags = le32toh(f->header->compatible_flags);
256 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
257 if (flags & ~HEADER_COMPATIBLE_ANY)
258 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
259 f->path, flags & ~HEADER_COMPATIBLE_ANY);
260 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
261 if (flags)
262 log_debug("Journal file %s uses compatible flags %"PRIx32
263 " disabled at compilation time.", f->path, flags);
264 return -EPROTONOSUPPORT;
7560fffc
LP
265 }
266
db11ac1a
LP
267 if (f->header->state >= _STATE_MAX)
268 return -EBADMSG;
269
dca6219e
LP
270 /* The first addition was n_data, so check that we are at least this large */
271 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
272 return -EBADMSG;
273
8088cbd3 274 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
275 return -EBADMSG;
276
db11ac1a
LP
277 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
278 return -ENODATA;
279
280 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
281 return -ENODATA;
282
7762e02b
LP
283 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
284 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
285 !VALID64(le64toh(f->header->tail_object_offset)) ||
286 !VALID64(le64toh(f->header->entry_array_offset)))
287 return -ENODATA;
288
cec736d2 289 if (f->writable) {
ccdbaf91 290 uint8_t state;
cec736d2
LP
291 sd_id128_t machine_id;
292 int r;
293
294 r = sd_id128_get_machine(&machine_id);
295 if (r < 0)
296 return r;
297
298 if (!sd_id128_equal(machine_id, f->header->machine_id))
299 return -EHOSTDOWN;
300
de190aef 301 state = f->header->state;
cec736d2 302
71fa6f00
LP
303 if (state == STATE_ONLINE) {
304 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
305 return -EBUSY;
306 } else if (state == STATE_ARCHIVED)
cec736d2 307 return -ESHUTDOWN;
71fa6f00
LP
308 else if (state != STATE_OFFLINE) {
309 log_debug("Journal file %s has unknown state %u.", f->path, state);
310 return -EBUSY;
311 }
cec736d2
LP
312 }
313
d89c8fdf
ZJS
314 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
315 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 316
f1889c91 317 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 318
cec736d2
LP
319 return 0;
320}
321
322static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 323 uint64_t old_size, new_size;
fec2aa2f 324 int r;
cec736d2
LP
325
326 assert(f);
327
cec736d2 328 /* We assume that this file is not sparse, and we know that
38ac38b2 329 * for sure, since we always call posix_fallocate()
cec736d2
LP
330 * ourselves */
331
fa6ac760
LP
332 if (mmap_cache_got_sigbus(f->mmap, f->fd))
333 return -EIO;
334
cec736d2 335 old_size =
23b0b2b2 336 le64toh(f->header->header_size) +
cec736d2
LP
337 le64toh(f->header->arena_size);
338
bc85bfee 339 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
340 if (new_size < le64toh(f->header->header_size))
341 new_size = le64toh(f->header->header_size);
bc85bfee
LP
342
343 if (new_size <= old_size)
cec736d2
LP
344 return 0;
345
a676e665 346 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 347 return -E2BIG;
cec736d2 348
a676e665 349 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
350 struct statvfs svfs;
351
352 if (fstatvfs(f->fd, &svfs) >= 0) {
353 uint64_t available;
354
355 available = svfs.f_bfree * svfs.f_bsize;
356
bc85bfee
LP
357 if (available >= f->metrics.keep_free)
358 available -= f->metrics.keep_free;
cec736d2
LP
359 else
360 available = 0;
361
362 if (new_size - old_size > available)
363 return -E2BIG;
364 }
365 }
366
eda4b58b
LP
367 /* Increase by larger blocks at once */
368 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
369 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
370 new_size = f->metrics.max_size;
371
bc85bfee
LP
372 /* Note that the glibc fallocate() fallback is very
373 inefficient, hence we try to minimize the allocation area
374 as we can. */
fec2aa2f
GV
375 r = posix_fallocate(f->fd, old_size, new_size - old_size);
376 if (r != 0)
377 return -r;
cec736d2 378
eda4b58b
LP
379 if (fstat(f->fd, &f->last_stat) < 0)
380 return -errno;
cec736d2 381
23b0b2b2 382 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2
LP
383
384 return 0;
385}
386
78519831 387static unsigned type_to_context(ObjectType type) {
d3d3208f 388 /* One context for each type, plus one catch-all for the rest */
69adae51 389 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 390 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 391 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
392}
393
7a9dabea 394static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
cec736d2 395 assert(f);
cec736d2
LP
396 assert(ret);
397
7762e02b
LP
398 if (size <= 0)
399 return -EINVAL;
400
2a59ea54 401 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
402 if (offset + size > (uint64_t) f->last_stat.st_size) {
403 /* Hmm, out of range? Let's refresh the fstat() data
404 * first, before we trust that check. */
405
406 if (fstat(f->fd, &f->last_stat) < 0 ||
407 offset + size > (uint64_t) f->last_stat.st_size)
408 return -EADDRNOTAVAIL;
409 }
410
7a9dabea 411 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
412}
413
16e9f408
LP
414static uint64_t minimum_header_size(Object *o) {
415
b8e891e6 416 static const uint64_t table[] = {
16e9f408
LP
417 [OBJECT_DATA] = sizeof(DataObject),
418 [OBJECT_FIELD] = sizeof(FieldObject),
419 [OBJECT_ENTRY] = sizeof(EntryObject),
420 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
421 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
422 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
423 [OBJECT_TAG] = sizeof(TagObject),
424 };
425
426 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
427 return sizeof(ObjectHeader);
428
429 return table[o->object.type];
430}
431
78519831 432int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
433 int r;
434 void *t;
435 Object *o;
436 uint64_t s;
437
438 assert(f);
439 assert(ret);
440
db11ac1a
LP
441 /* Objects may only be located at multiple of 64 bit */
442 if (!VALID64(offset))
443 return -EFAULT;
444
7a9dabea 445 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
446 if (r < 0)
447 return r;
448
449 o = (Object*) t;
450 s = le64toh(o->object.size);
451
452 if (s < sizeof(ObjectHeader))
453 return -EBADMSG;
454
16e9f408
LP
455 if (o->object.type <= OBJECT_UNUSED)
456 return -EBADMSG;
457
458 if (s < minimum_header_size(o))
459 return -EBADMSG;
460
d05089d8 461 if (type > OBJECT_UNUSED && o->object.type != type)
cec736d2
LP
462 return -EBADMSG;
463
464 if (s > sizeof(ObjectHeader)) {
7a9dabea 465 r = journal_file_move_to(f, type, false, offset, s, &t);
cec736d2
LP
466 if (r < 0)
467 return r;
468
469 o = (Object*) t;
470 }
471
cec736d2
LP
472 *ret = o;
473 return 0;
474}
475
d98cc1f2 476static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
477 uint64_t r;
478
479 assert(f);
480
beec0085 481 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
482
483 if (seqnum) {
de190aef 484 /* If an external seqnum counter was passed, we update
c2373f84
LP
485 * both the local and the external one, and set it to
486 * the maximum of both */
487
488 if (*seqnum + 1 > r)
489 r = *seqnum + 1;
490
491 *seqnum = r;
492 }
493
beec0085 494 f->header->tail_entry_seqnum = htole64(r);
cec736d2 495
beec0085
LP
496 if (f->header->head_entry_seqnum == 0)
497 f->header->head_entry_seqnum = htole64(r);
de190aef 498
cec736d2
LP
499 return r;
500}
501
78519831 502int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
503 int r;
504 uint64_t p;
505 Object *tail, *o;
506 void *t;
507
508 assert(f);
d05089d8 509 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
510 assert(size >= sizeof(ObjectHeader));
511 assert(offset);
512 assert(ret);
513
26687bf8
OS
514 r = journal_file_set_online(f);
515 if (r < 0)
516 return r;
517
cec736d2 518 p = le64toh(f->header->tail_object_offset);
cec736d2 519 if (p == 0)
23b0b2b2 520 p = le64toh(f->header->header_size);
cec736d2 521 else {
d05089d8 522 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
523 if (r < 0)
524 return r;
525
526 p += ALIGN64(le64toh(tail->object.size));
527 }
528
529 r = journal_file_allocate(f, p, size);
530 if (r < 0)
531 return r;
532
fcde2389 533 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
534 if (r < 0)
535 return r;
536
537 o = (Object*) t;
538
539 zero(o->object);
de190aef 540 o->object.type = type;
cec736d2
LP
541 o->object.size = htole64(size);
542
543 f->header->tail_object_offset = htole64(p);
cec736d2
LP
544 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
545
546 *ret = o;
547 *offset = p;
548
549 return 0;
550}
551
de190aef 552static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
553 uint64_t s, p;
554 Object *o;
555 int r;
556
557 assert(f);
558
dfabe643 559 /* We estimate that we need 1 hash table entry per 768 of
4a92baf3
LP
560 journal file and we want to make sure we never get beyond
561 75% fill level. Calculate the hash table size for the
562 maximum file size based on these metrics. */
563
dfabe643 564 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
565 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
566 s = DEFAULT_DATA_HASH_TABLE_SIZE;
567
507f22bd 568 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 569
de190aef
LP
570 r = journal_file_append_object(f,
571 OBJECT_DATA_HASH_TABLE,
572 offsetof(Object, hash_table.items) + s,
573 &o, &p);
cec736d2
LP
574 if (r < 0)
575 return r;
576
29804cc1 577 memzero(o->hash_table.items, s);
cec736d2 578
de190aef
LP
579 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
580 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
581
582 return 0;
583}
584
de190aef 585static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
586 uint64_t s, p;
587 Object *o;
588 int r;
589
590 assert(f);
591
3c1668da
LP
592 /* We use a fixed size hash table for the fields as this
593 * number should grow very slowly only */
594
de190aef
LP
595 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
596 r = journal_file_append_object(f,
597 OBJECT_FIELD_HASH_TABLE,
598 offsetof(Object, hash_table.items) + s,
599 &o, &p);
cec736d2
LP
600 if (r < 0)
601 return r;
602
29804cc1 603 memzero(o->hash_table.items, s);
cec736d2 604
de190aef
LP
605 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
606 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
607
608 return 0;
609}
610
de190aef 611static int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
612 uint64_t s, p;
613 void *t;
614 int r;
615
616 assert(f);
617
de190aef
LP
618 p = le64toh(f->header->data_hash_table_offset);
619 s = le64toh(f->header->data_hash_table_size);
cec736d2 620
de190aef 621 r = journal_file_move_to(f,
16e9f408 622 OBJECT_DATA_HASH_TABLE,
fcde2389 623 true,
de190aef
LP
624 p, s,
625 &t);
cec736d2
LP
626 if (r < 0)
627 return r;
628
de190aef 629 f->data_hash_table = t;
cec736d2
LP
630 return 0;
631}
632
de190aef 633static int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
634 uint64_t s, p;
635 void *t;
636 int r;
637
638 assert(f);
639
de190aef
LP
640 p = le64toh(f->header->field_hash_table_offset);
641 s = le64toh(f->header->field_hash_table_size);
cec736d2 642
de190aef 643 r = journal_file_move_to(f,
16e9f408 644 OBJECT_FIELD_HASH_TABLE,
fcde2389 645 true,
de190aef
LP
646 p, s,
647 &t);
cec736d2
LP
648 if (r < 0)
649 return r;
650
de190aef 651 f->field_hash_table = t;
cec736d2
LP
652 return 0;
653}
654
3c1668da
LP
655static int journal_file_link_field(
656 JournalFile *f,
657 Object *o,
658 uint64_t offset,
659 uint64_t hash) {
660
661 uint64_t p, h;
662 int r;
663
664 assert(f);
665 assert(o);
666 assert(offset > 0);
667
668 if (o->object.type != OBJECT_FIELD)
669 return -EINVAL;
670
671 /* This might alter the window we are looking at */
672
673 o->field.next_hash_offset = o->field.head_data_offset = 0;
674
675 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
676 p = le64toh(f->field_hash_table[h].tail_hash_offset);
677 if (p == 0)
678 f->field_hash_table[h].head_hash_offset = htole64(offset);
679 else {
680 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
681 if (r < 0)
682 return r;
683
684 o->field.next_hash_offset = htole64(offset);
685 }
686
687 f->field_hash_table[h].tail_hash_offset = htole64(offset);
688
689 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
690 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
691
692 return 0;
693}
694
695static int journal_file_link_data(
696 JournalFile *f,
697 Object *o,
698 uint64_t offset,
699 uint64_t hash) {
700
de190aef 701 uint64_t p, h;
cec736d2
LP
702 int r;
703
704 assert(f);
705 assert(o);
706 assert(offset > 0);
b588975f
LP
707
708 if (o->object.type != OBJECT_DATA)
709 return -EINVAL;
cec736d2 710
48496df6
LP
711 /* This might alter the window we are looking at */
712
de190aef
LP
713 o->data.next_hash_offset = o->data.next_field_offset = 0;
714 o->data.entry_offset = o->data.entry_array_offset = 0;
715 o->data.n_entries = 0;
cec736d2 716
de190aef 717 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
8db4213e 718 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 719 if (p == 0)
cec736d2 720 /* Only entry in the hash table is easy */
de190aef 721 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 722 else {
48496df6
LP
723 /* Move back to the previous data object, to patch in
724 * pointer */
cec736d2 725
de190aef 726 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
727 if (r < 0)
728 return r;
729
de190aef 730 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
731 }
732
de190aef 733 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 734
dca6219e
LP
735 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
736 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
737
cec736d2
LP
738 return 0;
739}
740
3c1668da
LP
741int journal_file_find_field_object_with_hash(
742 JournalFile *f,
743 const void *field, uint64_t size, uint64_t hash,
744 Object **ret, uint64_t *offset) {
745
746 uint64_t p, osize, h;
747 int r;
748
749 assert(f);
750 assert(field && size > 0);
751
752 osize = offsetof(Object, field.payload) + size;
753
754 if (f->header->field_hash_table_size == 0)
755 return -EBADMSG;
756
757 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
758 p = le64toh(f->field_hash_table[h].head_hash_offset);
759
760 while (p > 0) {
761 Object *o;
762
763 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
764 if (r < 0)
765 return r;
766
767 if (le64toh(o->field.hash) == hash &&
768 le64toh(o->object.size) == osize &&
769 memcmp(o->field.payload, field, size) == 0) {
770
771 if (ret)
772 *ret = o;
773 if (offset)
774 *offset = p;
775
776 return 1;
777 }
778
779 p = le64toh(o->field.next_hash_offset);
780 }
781
782 return 0;
783}
784
785int journal_file_find_field_object(
786 JournalFile *f,
787 const void *field, uint64_t size,
788 Object **ret, uint64_t *offset) {
789
790 uint64_t hash;
791
792 assert(f);
793 assert(field && size > 0);
794
795 hash = hash64(field, size);
796
797 return journal_file_find_field_object_with_hash(f,
798 field, size, hash,
799 ret, offset);
800}
801
de190aef
LP
802int journal_file_find_data_object_with_hash(
803 JournalFile *f,
804 const void *data, uint64_t size, uint64_t hash,
805 Object **ret, uint64_t *offset) {
48496df6 806
de190aef 807 uint64_t p, osize, h;
cec736d2
LP
808 int r;
809
810 assert(f);
811 assert(data || size == 0);
812
813 osize = offsetof(Object, data.payload) + size;
814
bc85bfee
LP
815 if (f->header->data_hash_table_size == 0)
816 return -EBADMSG;
817
de190aef
LP
818 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
819 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 820
de190aef
LP
821 while (p > 0) {
822 Object *o;
cec736d2 823
de190aef 824 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
825 if (r < 0)
826 return r;
827
807e17f0 828 if (le64toh(o->data.hash) != hash)
85a131e8 829 goto next;
807e17f0 830
d89c8fdf 831 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 832#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51
ZJS
833 uint64_t l;
834 size_t rsize;
cec736d2 835
807e17f0
LP
836 l = le64toh(o->object.size);
837 if (l <= offsetof(Object, data.payload))
cec736d2
LP
838 return -EBADMSG;
839
807e17f0
LP
840 l -= offsetof(Object, data.payload);
841
d89c8fdf
ZJS
842 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
843 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
844 if (r < 0)
845 return r;
807e17f0 846
b785c858 847 if (rsize == size &&
807e17f0
LP
848 memcmp(f->compress_buffer, data, size) == 0) {
849
850 if (ret)
851 *ret = o;
852
853 if (offset)
854 *offset = p;
855
856 return 1;
857 }
3b1a55e1
ZJS
858#else
859 return -EPROTONOSUPPORT;
860#endif
807e17f0
LP
861 } else if (le64toh(o->object.size) == osize &&
862 memcmp(o->data.payload, data, size) == 0) {
863
cec736d2
LP
864 if (ret)
865 *ret = o;
866
867 if (offset)
868 *offset = p;
869
de190aef 870 return 1;
cec736d2
LP
871 }
872
85a131e8 873 next:
cec736d2
LP
874 p = le64toh(o->data.next_hash_offset);
875 }
876
de190aef
LP
877 return 0;
878}
879
880int journal_file_find_data_object(
881 JournalFile *f,
882 const void *data, uint64_t size,
883 Object **ret, uint64_t *offset) {
884
885 uint64_t hash;
886
887 assert(f);
888 assert(data || size == 0);
889
890 hash = hash64(data, size);
891
892 return journal_file_find_data_object_with_hash(f,
893 data, size, hash,
894 ret, offset);
895}
896
3c1668da
LP
897static int journal_file_append_field(
898 JournalFile *f,
899 const void *field, uint64_t size,
900 Object **ret, uint64_t *offset) {
901
902 uint64_t hash, p;
903 uint64_t osize;
904 Object *o;
905 int r;
906
907 assert(f);
908 assert(field && size > 0);
909
910 hash = hash64(field, size);
911
912 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
913 if (r < 0)
914 return r;
915 else if (r > 0) {
916
917 if (ret)
918 *ret = o;
919
920 if (offset)
921 *offset = p;
922
923 return 0;
924 }
925
926 osize = offsetof(Object, field.payload) + size;
927 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
928 if (r < 0)
929 return r;
3c1668da
LP
930
931 o->field.hash = htole64(hash);
932 memcpy(o->field.payload, field, size);
933
934 r = journal_file_link_field(f, o, p, hash);
935 if (r < 0)
936 return r;
937
938 /* The linking might have altered the window, so let's
939 * refresh our pointer */
940 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
941 if (r < 0)
942 return r;
943
944#ifdef HAVE_GCRYPT
945 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
946 if (r < 0)
947 return r;
948#endif
949
950 if (ret)
951 *ret = o;
952
953 if (offset)
954 *offset = p;
955
956 return 0;
957}
958
48496df6
LP
959static int journal_file_append_data(
960 JournalFile *f,
961 const void *data, uint64_t size,
962 Object **ret, uint64_t *offset) {
963
de190aef
LP
964 uint64_t hash, p;
965 uint64_t osize;
966 Object *o;
d89c8fdf 967 int r, compression = 0;
3c1668da 968 const void *eq;
de190aef
LP
969
970 assert(f);
971 assert(data || size == 0);
972
973 hash = hash64(data, size);
974
975 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
976 if (r < 0)
977 return r;
978 else if (r > 0) {
979
980 if (ret)
981 *ret = o;
982
983 if (offset)
984 *offset = p;
985
986 return 0;
987 }
988
989 osize = offsetof(Object, data.payload) + size;
990 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
991 if (r < 0)
992 return r;
993
cec736d2 994 o->data.hash = htole64(hash);
807e17f0 995
d89c8fdf
ZJS
996#if defined(HAVE_XZ) || defined(HAVE_LZ4)
997 if (f->compress_xz &&
807e17f0 998 size >= COMPRESSION_SIZE_THRESHOLD) {
fa1c4b51 999 size_t rsize;
807e17f0 1000
d89c8fdf 1001 compression = compress_blob(data, size, o->data.payload, &rsize);
807e17f0 1002
d89c8fdf 1003 if (compression) {
807e17f0 1004 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1005 o->object.flags |= compression;
807e17f0 1006
fa1c4b51 1007 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1008 size, rsize, object_compressed_to_string(compression));
807e17f0
LP
1009 }
1010 }
1011#endif
1012
d89c8fdf 1013 if (!compression && size > 0)
807e17f0 1014 memcpy(o->data.payload, data, size);
cec736d2 1015
de190aef 1016 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1017 if (r < 0)
1018 return r;
1019
48496df6
LP
1020 /* The linking might have altered the window, so let's
1021 * refresh our pointer */
1022 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1023 if (r < 0)
1024 return r;
1025
08c6f819
SL
1026 if (!data)
1027 eq = NULL;
1028 else
1029 eq = memchr(data, '=', size);
3c1668da 1030 if (eq && eq > data) {
748db592 1031 Object *fo = NULL;
3c1668da 1032 uint64_t fp;
3c1668da
LP
1033
1034 /* Create field object ... */
1035 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1036 if (r < 0)
1037 return r;
1038
1039 /* ... and link it in. */
1040 o->data.next_field_offset = fo->field.head_data_offset;
1041 fo->field.head_data_offset = le64toh(p);
1042 }
1043
5996c7c2
LP
1044#ifdef HAVE_GCRYPT
1045 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1046 if (r < 0)
1047 return r;
1048#endif
1049
cec736d2
LP
1050 if (ret)
1051 *ret = o;
1052
1053 if (offset)
de190aef 1054 *offset = p;
cec736d2
LP
1055
1056 return 0;
1057}
1058
1059uint64_t journal_file_entry_n_items(Object *o) {
1060 assert(o);
b588975f
LP
1061
1062 if (o->object.type != OBJECT_ENTRY)
1063 return 0;
cec736d2
LP
1064
1065 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1066}
1067
0284adc6 1068uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1069 assert(o);
b588975f
LP
1070
1071 if (o->object.type != OBJECT_ENTRY_ARRAY)
1072 return 0;
de190aef
LP
1073
1074 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1075}
1076
fb9a24b6
LP
1077uint64_t journal_file_hash_table_n_items(Object *o) {
1078 assert(o);
b588975f
LP
1079
1080 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1081 o->object.type != OBJECT_FIELD_HASH_TABLE)
1082 return 0;
fb9a24b6
LP
1083
1084 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1085}
1086
de190aef 1087static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1088 le64_t *first,
1089 le64_t *idx,
de190aef 1090 uint64_t p) {
cec736d2 1091 int r;
de190aef
LP
1092 uint64_t n = 0, ap = 0, q, i, a, hidx;
1093 Object *o;
1094
cec736d2 1095 assert(f);
de190aef
LP
1096 assert(first);
1097 assert(idx);
1098 assert(p > 0);
cec736d2 1099
de190aef
LP
1100 a = le64toh(*first);
1101 i = hidx = le64toh(*idx);
1102 while (a > 0) {
1103
1104 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1105 if (r < 0)
1106 return r;
cec736d2 1107
de190aef
LP
1108 n = journal_file_entry_array_n_items(o);
1109 if (i < n) {
1110 o->entry_array.items[i] = htole64(p);
1111 *idx = htole64(hidx + 1);
1112 return 0;
1113 }
cec736d2 1114
de190aef
LP
1115 i -= n;
1116 ap = a;
1117 a = le64toh(o->entry_array.next_entry_array_offset);
1118 }
1119
1120 if (hidx > n)
1121 n = (hidx+1) * 2;
1122 else
1123 n = n * 2;
1124
1125 if (n < 4)
1126 n = 4;
1127
1128 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1129 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1130 &o, &q);
cec736d2
LP
1131 if (r < 0)
1132 return r;
1133
feb12d3e 1134#ifdef HAVE_GCRYPT
5996c7c2 1135 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1136 if (r < 0)
1137 return r;
feb12d3e 1138#endif
b0af6f41 1139
de190aef 1140 o->entry_array.items[i] = htole64(p);
cec736d2 1141
de190aef 1142 if (ap == 0)
7be3aa17 1143 *first = htole64(q);
cec736d2 1144 else {
de190aef 1145 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1146 if (r < 0)
1147 return r;
1148
de190aef
LP
1149 o->entry_array.next_entry_array_offset = htole64(q);
1150 }
cec736d2 1151
2dee23eb
LP
1152 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1153 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1154
de190aef
LP
1155 *idx = htole64(hidx + 1);
1156
1157 return 0;
1158}
cec736d2 1159
de190aef 1160static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1161 le64_t *extra,
1162 le64_t *first,
1163 le64_t *idx,
de190aef
LP
1164 uint64_t p) {
1165
1166 int r;
1167
1168 assert(f);
1169 assert(extra);
1170 assert(first);
1171 assert(idx);
1172 assert(p > 0);
1173
1174 if (*idx == 0)
1175 *extra = htole64(p);
1176 else {
4fd052ae 1177 le64_t i;
de190aef 1178
7be3aa17 1179 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1180 r = link_entry_into_array(f, first, &i, p);
1181 if (r < 0)
1182 return r;
cec736d2
LP
1183 }
1184
de190aef
LP
1185 *idx = htole64(le64toh(*idx) + 1);
1186 return 0;
1187}
1188
1189static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1190 uint64_t p;
1191 int r;
1192 assert(f);
1193 assert(o);
1194 assert(offset > 0);
1195
1196 p = le64toh(o->entry.items[i].object_offset);
1197 if (p == 0)
1198 return -EINVAL;
1199
1200 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1201 if (r < 0)
1202 return r;
1203
de190aef
LP
1204 return link_entry_into_array_plus_one(f,
1205 &o->data.entry_offset,
1206 &o->data.entry_array_offset,
1207 &o->data.n_entries,
1208 offset);
cec736d2
LP
1209}
1210
1211static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1212 uint64_t n, i;
cec736d2
LP
1213 int r;
1214
1215 assert(f);
1216 assert(o);
1217 assert(offset > 0);
b588975f
LP
1218
1219 if (o->object.type != OBJECT_ENTRY)
1220 return -EINVAL;
cec736d2 1221
b788cc23
LP
1222 __sync_synchronize();
1223
cec736d2 1224 /* Link up the entry itself */
de190aef
LP
1225 r = link_entry_into_array(f,
1226 &f->header->entry_array_offset,
1227 &f->header->n_entries,
1228 offset);
1229 if (r < 0)
1230 return r;
cec736d2 1231
507f22bd 1232 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1233
de190aef 1234 if (f->header->head_entry_realtime == 0)
0ac38b70 1235 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1236
0ac38b70 1237 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1238 f->header->tail_entry_monotonic = o->entry.monotonic;
1239
1240 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1241
1242 /* Link up the items */
1243 n = journal_file_entry_n_items(o);
1244 for (i = 0; i < n; i++) {
1245 r = journal_file_link_entry_item(f, o, offset, i);
1246 if (r < 0)
1247 return r;
1248 }
1249
cec736d2
LP
1250 return 0;
1251}
1252
1253static int journal_file_append_entry_internal(
1254 JournalFile *f,
1255 const dual_timestamp *ts,
1256 uint64_t xor_hash,
1257 const EntryItem items[], unsigned n_items,
de190aef 1258 uint64_t *seqnum,
cec736d2
LP
1259 Object **ret, uint64_t *offset) {
1260 uint64_t np;
1261 uint64_t osize;
1262 Object *o;
1263 int r;
1264
1265 assert(f);
1266 assert(items || n_items == 0);
de190aef 1267 assert(ts);
cec736d2
LP
1268
1269 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1270
de190aef 1271 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1272 if (r < 0)
1273 return r;
1274
d98cc1f2 1275 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1276 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1277 o->entry.realtime = htole64(ts->realtime);
1278 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1279 o->entry.xor_hash = htole64(xor_hash);
1280 o->entry.boot_id = f->header->boot_id;
1281
feb12d3e 1282#ifdef HAVE_GCRYPT
5996c7c2 1283 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1284 if (r < 0)
1285 return r;
feb12d3e 1286#endif
b0af6f41 1287
cec736d2
LP
1288 r = journal_file_link_entry(f, o, np);
1289 if (r < 0)
1290 return r;
1291
1292 if (ret)
1293 *ret = o;
1294
1295 if (offset)
1296 *offset = np;
1297
1298 return 0;
1299}
1300
cf244689 1301void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1302 assert(f);
1303
1304 /* inotify() does not receive IN_MODIFY events from file
1305 * accesses done via mmap(). After each access we hence
1306 * trigger IN_MODIFY by truncating the journal file to its
1307 * current size which triggers IN_MODIFY. */
1308
bc85bfee
LP
1309 __sync_synchronize();
1310
50f20cfd 1311 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
56f64d95 1312 log_error_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1313}
1314
1f2da9ec
LP
1315static int entry_item_cmp(const void *_a, const void *_b) {
1316 const EntryItem *a = _a, *b = _b;
1317
1318 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1319 return -1;
1320 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1321 return 1;
1322 return 0;
1323}
1324
de190aef 1325int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1326 unsigned i;
1327 EntryItem *items;
1328 int r;
1329 uint64_t xor_hash = 0;
de190aef 1330 struct dual_timestamp _ts;
cec736d2
LP
1331
1332 assert(f);
1333 assert(iovec || n_iovec == 0);
1334
de190aef
LP
1335 if (!ts) {
1336 dual_timestamp_get(&_ts);
1337 ts = &_ts;
1338 }
1339
1340 if (f->tail_entry_monotonic_valid &&
1341 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1342 return -EINVAL;
1343
feb12d3e 1344#ifdef HAVE_GCRYPT
7560fffc
LP
1345 r = journal_file_maybe_append_tag(f, ts->realtime);
1346 if (r < 0)
1347 return r;
feb12d3e 1348#endif
7560fffc 1349
64825d3c 1350 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1351 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1352
1353 for (i = 0; i < n_iovec; i++) {
1354 uint64_t p;
1355 Object *o;
1356
1357 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1358 if (r < 0)
cf244689 1359 return r;
cec736d2
LP
1360
1361 xor_hash ^= le64toh(o->data.hash);
1362 items[i].object_offset = htole64(p);
de7b95cd 1363 items[i].hash = o->data.hash;
cec736d2
LP
1364 }
1365
1f2da9ec
LP
1366 /* Order by the position on disk, in order to improve seek
1367 * times for rotating media. */
7ff7394d 1368 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1369
de190aef 1370 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1371
fa6ac760
LP
1372 /* If the memory mapping triggered a SIGBUS then we return an
1373 * IO error and ignore the error code passed down to us, since
1374 * it is very likely just an effect of a nullified replacement
1375 * mapping page */
1376
1377 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1378 r = -EIO;
1379
50f20cfd
LP
1380 journal_file_post_change(f);
1381
cec736d2
LP
1382 return r;
1383}
1384
/* One cached position within a chain of entry array objects, stored in the
 * chain cache under the offset of the chain's first array. */
typedef struct ChainCacheItem {
        uint64_t first;      /* offset of the array the chain starts with (the cache key) */
        uint64_t array;      /* offset of the array this item refers to */
        uint64_t begin;      /* value of item 0 of that array */
        uint64_t total;      /* number of items in all arrays preceding that array in the chain */
        uint64_t last_index; /* index looked at last, used to improve locality when bisecting */
} ChainCacheItem;
1392
1393static void chain_cache_put(
4743015d 1394 OrderedHashmap *h,
a4bcff5b
LP
1395 ChainCacheItem *ci,
1396 uint64_t first,
1397 uint64_t array,
1398 uint64_t begin,
f268980d
LP
1399 uint64_t total,
1400 uint64_t last_index) {
a4bcff5b
LP
1401
1402 if (!ci) {
34741aa3
LP
1403 /* If the chain item to cache for this chain is the
1404 * first one it's not worth caching anything */
1405 if (array == first)
1406 return;
1407
29433089 1408 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 1409 ci = ordered_hashmap_steal_first(h);
29433089
LP
1410 assert(ci);
1411 } else {
a4bcff5b
LP
1412 ci = new(ChainCacheItem, 1);
1413 if (!ci)
1414 return;
1415 }
1416
1417 ci->first = first;
1418
4743015d 1419 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
1420 free(ci);
1421 return;
1422 }
1423 } else
1424 assert(ci->first == first);
1425
1426 ci->array = array;
1427 ci->begin = begin;
1428 ci->total = total;
f268980d 1429 ci->last_index = last_index;
a4bcff5b
LP
1430}
1431
f268980d
LP
1432static int generic_array_get(
1433 JournalFile *f,
1434 uint64_t first,
1435 uint64_t i,
1436 Object **ret, uint64_t *offset) {
de190aef 1437
cec736d2 1438 Object *o;
a4bcff5b 1439 uint64_t p = 0, a, t = 0;
cec736d2 1440 int r;
a4bcff5b 1441 ChainCacheItem *ci;
cec736d2
LP
1442
1443 assert(f);
1444
de190aef 1445 a = first;
a4bcff5b
LP
1446
1447 /* Try the chain cache first */
4743015d 1448 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1449 if (ci && i > ci->total) {
1450 a = ci->array;
1451 i -= ci->total;
1452 t = ci->total;
1453 }
1454
de190aef 1455 while (a > 0) {
a4bcff5b 1456 uint64_t k;
cec736d2 1457
de190aef
LP
1458 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1459 if (r < 0)
1460 return r;
cec736d2 1461
a4bcff5b
LP
1462 k = journal_file_entry_array_n_items(o);
1463 if (i < k) {
de190aef 1464 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1465 goto found;
cec736d2
LP
1466 }
1467
a4bcff5b
LP
1468 i -= k;
1469 t += k;
de190aef
LP
1470 a = le64toh(o->entry_array.next_entry_array_offset);
1471 }
1472
a4bcff5b
LP
1473 return 0;
1474
1475found:
1476 /* Let's cache this item for the next invocation */
af13a6b0 1477 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1478
1479 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1480 if (r < 0)
1481 return r;
1482
1483 if (ret)
1484 *ret = o;
1485
1486 if (offset)
1487 *offset = p;
1488
1489 return 1;
1490}
1491
f268980d
LP
1492static int generic_array_get_plus_one(
1493 JournalFile *f,
1494 uint64_t extra,
1495 uint64_t first,
1496 uint64_t i,
1497 Object **ret, uint64_t *offset) {
de190aef
LP
1498
1499 Object *o;
1500
1501 assert(f);
1502
1503 if (i == 0) {
1504 int r;
1505
1506 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1507 if (r < 0)
1508 return r;
1509
de190aef
LP
1510 if (ret)
1511 *ret = o;
cec736d2 1512
de190aef
LP
1513 if (offset)
1514 *offset = extra;
cec736d2 1515
de190aef 1516 return 1;
cec736d2
LP
1517 }
1518
de190aef
LP
1519 return generic_array_get(f, first, i-1, ret, offset);
1520}
cec736d2 1521
de190aef
LP
/* Results of the test_object callbacks used for bisection */
enum {
        TEST_FOUND = 0, /* the tested entry matches the needle */
        TEST_LEFT  = 1, /* the tested entry is smaller than the needle */
        TEST_RIGHT = 2  /* the tested entry is larger than the needle */
};
cec736d2 1527
f268980d
LP
1528static int generic_array_bisect(
1529 JournalFile *f,
1530 uint64_t first,
1531 uint64_t n,
1532 uint64_t needle,
1533 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1534 direction_t direction,
1535 Object **ret,
1536 uint64_t *offset,
1537 uint64_t *idx) {
1538
1539 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1540 bool subtract_one = false;
1541 Object *o, *array = NULL;
1542 int r;
a4bcff5b 1543 ChainCacheItem *ci;
cec736d2 1544
de190aef
LP
1545 assert(f);
1546 assert(test_object);
cec736d2 1547
a4bcff5b 1548 /* Start with the first array in the chain */
de190aef 1549 a = first;
a4bcff5b 1550
4743015d 1551 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1552 if (ci && n > ci->total) {
1553 /* Ah, we have iterated this bisection array chain
1554 * previously! Let's see if we can skip ahead in the
1555 * chain, as far as the last time. But we can't jump
1556 * backwards in the chain, so let's check that
1557 * first. */
1558
1559 r = test_object(f, ci->begin, needle);
1560 if (r < 0)
1561 return r;
1562
1563 if (r == TEST_LEFT) {
f268980d 1564 /* OK, what we are looking for is right of the
a4bcff5b
LP
1565 * begin of this EntryArray, so let's jump
1566 * straight to previously cached array in the
1567 * chain */
1568
1569 a = ci->array;
1570 n -= ci->total;
1571 t = ci->total;
f268980d 1572 last_index = ci->last_index;
a4bcff5b
LP
1573 }
1574 }
1575
de190aef
LP
1576 while (a > 0) {
1577 uint64_t left, right, k, lp;
1578
1579 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1580 if (r < 0)
1581 return r;
1582
de190aef
LP
1583 k = journal_file_entry_array_n_items(array);
1584 right = MIN(k, n);
1585 if (right <= 0)
1586 return 0;
cec736d2 1587
de190aef
LP
1588 i = right - 1;
1589 lp = p = le64toh(array->entry_array.items[i]);
1590 if (p <= 0)
1591 return -EBADMSG;
cec736d2 1592
de190aef
LP
1593 r = test_object(f, p, needle);
1594 if (r < 0)
1595 return r;
cec736d2 1596
de190aef
LP
1597 if (r == TEST_FOUND)
1598 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1599
1600 if (r == TEST_RIGHT) {
1601 left = 0;
1602 right -= 1;
f268980d
LP
1603
1604 if (last_index != (uint64_t) -1) {
1605 assert(last_index <= right);
1606
1607 /* If we cached the last index we
1608 * looked at, let's try to not to jump
1609 * too wildly around and see if we can
1610 * limit the range to look at early to
1611 * the immediate neighbors of the last
1612 * index we looked at. */
1613
1614 if (last_index > 0) {
1615 uint64_t x = last_index - 1;
1616
1617 p = le64toh(array->entry_array.items[x]);
1618 if (p <= 0)
1619 return -EBADMSG;
1620
1621 r = test_object(f, p, needle);
1622 if (r < 0)
1623 return r;
1624
1625 if (r == TEST_FOUND)
1626 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1627
1628 if (r == TEST_RIGHT)
1629 right = x;
1630 else
1631 left = x + 1;
1632 }
1633
1634 if (last_index < right) {
1635 uint64_t y = last_index + 1;
1636
1637 p = le64toh(array->entry_array.items[y]);
1638 if (p <= 0)
1639 return -EBADMSG;
1640
1641 r = test_object(f, p, needle);
1642 if (r < 0)
1643 return r;
1644
1645 if (r == TEST_FOUND)
1646 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1647
1648 if (r == TEST_RIGHT)
1649 right = y;
1650 else
1651 left = y + 1;
1652 }
f268980d
LP
1653 }
1654
de190aef
LP
1655 for (;;) {
1656 if (left == right) {
1657 if (direction == DIRECTION_UP)
1658 subtract_one = true;
1659
1660 i = left;
1661 goto found;
1662 }
1663
1664 assert(left < right);
de190aef 1665 i = (left + right) / 2;
f268980d 1666
de190aef
LP
1667 p = le64toh(array->entry_array.items[i]);
1668 if (p <= 0)
1669 return -EBADMSG;
1670
1671 r = test_object(f, p, needle);
1672 if (r < 0)
1673 return r;
cec736d2 1674
de190aef
LP
1675 if (r == TEST_FOUND)
1676 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1677
1678 if (r == TEST_RIGHT)
1679 right = i;
1680 else
1681 left = i + 1;
1682 }
1683 }
1684
2173cbf8 1685 if (k >= n) {
cbdca852
LP
1686 if (direction == DIRECTION_UP) {
1687 i = n;
1688 subtract_one = true;
1689 goto found;
1690 }
1691
cec736d2 1692 return 0;
cbdca852 1693 }
cec736d2 1694
de190aef
LP
1695 last_p = lp;
1696
1697 n -= k;
1698 t += k;
f268980d 1699 last_index = (uint64_t) -1;
de190aef 1700 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1701 }
1702
1703 return 0;
de190aef
LP
1704
1705found:
1706 if (subtract_one && t == 0 && i == 0)
1707 return 0;
1708
a4bcff5b 1709 /* Let's cache this item for the next invocation */
af13a6b0 1710 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 1711
de190aef
LP
1712 if (subtract_one && i == 0)
1713 p = last_p;
1714 else if (subtract_one)
1715 p = le64toh(array->entry_array.items[i-1]);
1716 else
1717 p = le64toh(array->entry_array.items[i]);
1718
1719 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1720 if (r < 0)
1721 return r;
1722
1723 if (ret)
1724 *ret = o;
1725
1726 if (offset)
1727 *offset = p;
1728
1729 if (idx)
cbdca852 1730 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1731
1732 return 1;
cec736d2
LP
1733}
1734
f268980d
LP
1735static int generic_array_bisect_plus_one(
1736 JournalFile *f,
1737 uint64_t extra,
1738 uint64_t first,
1739 uint64_t n,
1740 uint64_t needle,
1741 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1742 direction_t direction,
1743 Object **ret,
1744 uint64_t *offset,
1745 uint64_t *idx) {
de190aef 1746
cec736d2 1747 int r;
cbdca852
LP
1748 bool step_back = false;
1749 Object *o;
cec736d2
LP
1750
1751 assert(f);
de190aef 1752 assert(test_object);
cec736d2 1753
de190aef
LP
1754 if (n <= 0)
1755 return 0;
cec736d2 1756
de190aef
LP
1757 /* This bisects the array in object 'first', but first checks
1758 * an extra */
de190aef
LP
1759 r = test_object(f, extra, needle);
1760 if (r < 0)
1761 return r;
a536e261
LP
1762
1763 if (r == TEST_FOUND)
1764 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1765
cbdca852
LP
1766 /* if we are looking with DIRECTION_UP then we need to first
1767 see if in the actual array there is a matching entry, and
1768 return the last one of that. But if there isn't any we need
1769 to return this one. Hence remember this, and return it
1770 below. */
1771 if (r == TEST_LEFT)
1772 step_back = direction == DIRECTION_UP;
de190aef 1773
cbdca852
LP
1774 if (r == TEST_RIGHT) {
1775 if (direction == DIRECTION_DOWN)
1776 goto found;
1777 else
1778 return 0;
a536e261 1779 }
cec736d2 1780
de190aef
LP
1781 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1782
cbdca852
LP
1783 if (r == 0 && step_back)
1784 goto found;
1785
ecf68b1d 1786 if (r > 0 && idx)
de190aef
LP
1787 (*idx) ++;
1788
1789 return r;
cbdca852
LP
1790
1791found:
1792 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1793 if (r < 0)
1794 return r;
1795
1796 if (ret)
1797 *ret = o;
1798
1799 if (offset)
1800 *offset = extra;
1801
1802 if (idx)
1803 *idx = 0;
1804
1805 return 1;
1806}
1807
44a6b1b6 1808_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1809 assert(f);
1810 assert(p > 0);
1811
1812 if (p == needle)
1813 return TEST_FOUND;
1814 else if (p < needle)
1815 return TEST_LEFT;
1816 else
1817 return TEST_RIGHT;
1818}
1819
de190aef
LP
1820static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1821 Object *o;
1822 int r;
1823
1824 assert(f);
1825 assert(p > 0);
1826
1827 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1828 if (r < 0)
1829 return r;
1830
de190aef
LP
1831 if (le64toh(o->entry.seqnum) == needle)
1832 return TEST_FOUND;
1833 else if (le64toh(o->entry.seqnum) < needle)
1834 return TEST_LEFT;
1835 else
1836 return TEST_RIGHT;
1837}
cec736d2 1838
de190aef
LP
1839int journal_file_move_to_entry_by_seqnum(
1840 JournalFile *f,
1841 uint64_t seqnum,
1842 direction_t direction,
1843 Object **ret,
1844 uint64_t *offset) {
1845
1846 return generic_array_bisect(f,
1847 le64toh(f->header->entry_array_offset),
1848 le64toh(f->header->n_entries),
1849 seqnum,
1850 test_object_seqnum,
1851 direction,
1852 ret, offset, NULL);
1853}
cec736d2 1854
de190aef
LP
1855static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1856 Object *o;
1857 int r;
1858
1859 assert(f);
1860 assert(p > 0);
1861
1862 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1863 if (r < 0)
1864 return r;
1865
1866 if (le64toh(o->entry.realtime) == needle)
1867 return TEST_FOUND;
1868 else if (le64toh(o->entry.realtime) < needle)
1869 return TEST_LEFT;
1870 else
1871 return TEST_RIGHT;
cec736d2
LP
1872}
1873
de190aef
LP
1874int journal_file_move_to_entry_by_realtime(
1875 JournalFile *f,
1876 uint64_t realtime,
1877 direction_t direction,
1878 Object **ret,
1879 uint64_t *offset) {
1880
1881 return generic_array_bisect(f,
1882 le64toh(f->header->entry_array_offset),
1883 le64toh(f->header->n_entries),
1884 realtime,
1885 test_object_realtime,
1886 direction,
1887 ret, offset, NULL);
1888}
1889
1890static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1891 Object *o;
1892 int r;
1893
1894 assert(f);
1895 assert(p > 0);
1896
1897 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1898 if (r < 0)
1899 return r;
1900
1901 if (le64toh(o->entry.monotonic) == needle)
1902 return TEST_FOUND;
1903 else if (le64toh(o->entry.monotonic) < needle)
1904 return TEST_LEFT;
1905 else
1906 return TEST_RIGHT;
1907}
1908
47838ab3
ZJS
1909static inline int find_data_object_by_boot_id(
1910 JournalFile *f,
1911 sd_id128_t boot_id,
1912 Object **o,
1913 uint64_t *b) {
1914 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1915
1916 sd_id128_to_string(boot_id, t + 9);
1917 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1918}
1919
de190aef
LP
1920int journal_file_move_to_entry_by_monotonic(
1921 JournalFile *f,
1922 sd_id128_t boot_id,
1923 uint64_t monotonic,
1924 direction_t direction,
1925 Object **ret,
1926 uint64_t *offset) {
1927
de190aef
LP
1928 Object *o;
1929 int r;
1930
cbdca852 1931 assert(f);
de190aef 1932
47838ab3 1933 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
1934 if (r < 0)
1935 return r;
cbdca852 1936 if (r == 0)
de190aef
LP
1937 return -ENOENT;
1938
1939 return generic_array_bisect_plus_one(f,
1940 le64toh(o->data.entry_offset),
1941 le64toh(o->data.entry_array_offset),
1942 le64toh(o->data.n_entries),
1943 monotonic,
1944 test_object_monotonic,
1945 direction,
1946 ret, offset, NULL);
1947}
1948
1fc605b0 1949void journal_file_reset_location(JournalFile *f) {
6573ef05 1950 f->location_type = LOCATION_HEAD;
1fc605b0 1951 f->current_offset = 0;
6573ef05
MS
1952 f->current_seqnum = 0;
1953 f->current_realtime = 0;
1954 f->current_monotonic = 0;
1955 zero(f->current_boot_id);
1956 f->current_xor_hash = 0;
1957}
1958
1959void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
1960 f->last_direction = direction;
1961 f->location_type = LOCATION_SEEK;
1962 f->current_offset = offset;
1963 f->current_seqnum = le64toh(o->entry.seqnum);
1964 f->current_realtime = le64toh(o->entry.realtime);
1965 f->current_monotonic = le64toh(o->entry.monotonic);
1966 f->current_boot_id = o->entry.boot_id;
1967 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
1968}
1969
d8ae66d7
MS
1970int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
1971 assert(af);
1972 assert(bf);
1973 assert(af->location_type == LOCATION_SEEK);
1974 assert(bf->location_type == LOCATION_SEEK);
1975
1976 /* If contents and timestamps match, these entries are
1977 * identical, even if the seqnum does not match */
1978 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
1979 af->current_monotonic == bf->current_monotonic &&
1980 af->current_realtime == bf->current_realtime &&
1981 af->current_xor_hash == bf->current_xor_hash)
1982 return 0;
1983
1984 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
1985
1986 /* If this is from the same seqnum source, compare
1987 * seqnums */
1988 if (af->current_seqnum < bf->current_seqnum)
1989 return -1;
1990 if (af->current_seqnum > bf->current_seqnum)
1991 return 1;
1992
1993 /* Wow! This is weird, different data but the same
1994 * seqnums? Something is borked, but let's make the
1995 * best of it and compare by time. */
1996 }
1997
1998 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
1999
2000 /* If the boot id matches, compare monotonic time */
2001 if (af->current_monotonic < bf->current_monotonic)
2002 return -1;
2003 if (af->current_monotonic > bf->current_monotonic)
2004 return 1;
2005 }
2006
2007 /* Otherwise, compare UTC time */
2008 if (af->current_realtime < bf->current_realtime)
2009 return -1;
2010 if (af->current_realtime > bf->current_realtime)
2011 return 1;
2012
2013 /* Finally, compare by contents */
2014 if (af->current_xor_hash < bf->current_xor_hash)
2015 return -1;
2016 if (af->current_xor_hash > bf->current_xor_hash)
2017 return 1;
2018
2019 return 0;
2020}
2021
de190aef
LP
2022int journal_file_next_entry(
2023 JournalFile *f,
f534928a 2024 uint64_t p,
de190aef
LP
2025 direction_t direction,
2026 Object **ret, uint64_t *offset) {
2027
fb099c8d 2028 uint64_t i, n, ofs;
cec736d2
LP
2029 int r;
2030
2031 assert(f);
de190aef
LP
2032
2033 n = le64toh(f->header->n_entries);
2034 if (n <= 0)
2035 return 0;
cec736d2 2036
f534928a 2037 if (p == 0)
de190aef 2038 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2039 else {
de190aef
LP
2040 r = generic_array_bisect(f,
2041 le64toh(f->header->entry_array_offset),
2042 le64toh(f->header->n_entries),
2043 p,
2044 test_object_offset,
2045 DIRECTION_DOWN,
2046 NULL, NULL,
2047 &i);
2048 if (r <= 0)
2049 return r;
2050
2051 if (direction == DIRECTION_DOWN) {
2052 if (i >= n - 1)
2053 return 0;
2054
2055 i++;
2056 } else {
2057 if (i <= 0)
2058 return 0;
2059
2060 i--;
2061 }
cec736d2
LP
2062 }
2063
de190aef 2064 /* And jump to it */
fb099c8d
ZJS
2065 r = generic_array_get(f,
2066 le64toh(f->header->entry_array_offset),
2067 i,
2068 ret, &ofs);
2069 if (r <= 0)
2070 return r;
2071
2072 if (p > 0 &&
2073 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2074 log_debug("%s: entry array corrupted at entry %"PRIu64,
2075 f->path, i);
2076 return -EBADMSG;
2077 }
2078
2079 if (offset)
2080 *offset = ofs;
2081
2082 return 1;
de190aef 2083}
cec736d2 2084
de190aef
LP
2085int journal_file_next_entry_for_data(
2086 JournalFile *f,
2087 Object *o, uint64_t p,
2088 uint64_t data_offset,
2089 direction_t direction,
2090 Object **ret, uint64_t *offset) {
2091
2092 uint64_t n, i;
cec736d2 2093 int r;
de190aef 2094 Object *d;
cec736d2
LP
2095
2096 assert(f);
de190aef 2097 assert(p > 0 || !o);
cec736d2 2098
de190aef 2099 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2100 if (r < 0)
de190aef 2101 return r;
cec736d2 2102
de190aef
LP
2103 n = le64toh(d->data.n_entries);
2104 if (n <= 0)
2105 return n;
cec736d2 2106
de190aef
LP
2107 if (!o)
2108 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2109 else {
2110 if (o->object.type != OBJECT_ENTRY)
2111 return -EINVAL;
cec736d2 2112
de190aef
LP
2113 r = generic_array_bisect_plus_one(f,
2114 le64toh(d->data.entry_offset),
2115 le64toh(d->data.entry_array_offset),
2116 le64toh(d->data.n_entries),
2117 p,
2118 test_object_offset,
2119 DIRECTION_DOWN,
2120 NULL, NULL,
2121 &i);
2122
2123 if (r <= 0)
cec736d2
LP
2124 return r;
2125
de190aef
LP
2126 if (direction == DIRECTION_DOWN) {
2127 if (i >= n - 1)
2128 return 0;
cec736d2 2129
de190aef
LP
2130 i++;
2131 } else {
2132 if (i <= 0)
2133 return 0;
cec736d2 2134
de190aef
LP
2135 i--;
2136 }
cec736d2 2137
de190aef 2138 }
cec736d2 2139
de190aef
LP
2140 return generic_array_get_plus_one(f,
2141 le64toh(d->data.entry_offset),
2142 le64toh(d->data.entry_array_offset),
2143 i,
2144 ret, offset);
2145}
cec736d2 2146
cbdca852
LP
2147int journal_file_move_to_entry_by_offset_for_data(
2148 JournalFile *f,
2149 uint64_t data_offset,
2150 uint64_t p,
2151 direction_t direction,
2152 Object **ret, uint64_t *offset) {
2153
2154 int r;
2155 Object *d;
2156
2157 assert(f);
2158
2159 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2160 if (r < 0)
2161 return r;
2162
2163 return generic_array_bisect_plus_one(f,
2164 le64toh(d->data.entry_offset),
2165 le64toh(d->data.entry_array_offset),
2166 le64toh(d->data.n_entries),
2167 p,
2168 test_object_offset,
2169 direction,
2170 ret, offset, NULL);
2171}
2172
2173int journal_file_move_to_entry_by_monotonic_for_data(
2174 JournalFile *f,
2175 uint64_t data_offset,
2176 sd_id128_t boot_id,
2177 uint64_t monotonic,
2178 direction_t direction,
2179 Object **ret, uint64_t *offset) {
2180
cbdca852
LP
2181 Object *o, *d;
2182 int r;
2183 uint64_t b, z;
2184
2185 assert(f);
2186
2187 /* First, seek by time */
47838ab3 2188 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2189 if (r < 0)
2190 return r;
2191 if (r == 0)
2192 return -ENOENT;
2193
2194 r = generic_array_bisect_plus_one(f,
2195 le64toh(o->data.entry_offset),
2196 le64toh(o->data.entry_array_offset),
2197 le64toh(o->data.n_entries),
2198 monotonic,
2199 test_object_monotonic,
2200 direction,
2201 NULL, &z, NULL);
2202 if (r <= 0)
2203 return r;
2204
2205 /* And now, continue seeking until we find an entry that
2206 * exists in both bisection arrays */
2207
2208 for (;;) {
2209 Object *qo;
2210 uint64_t p, q;
2211
2212 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2213 if (r < 0)
2214 return r;
2215
2216 r = generic_array_bisect_plus_one(f,
2217 le64toh(d->data.entry_offset),
2218 le64toh(d->data.entry_array_offset),
2219 le64toh(d->data.n_entries),
2220 z,
2221 test_object_offset,
2222 direction,
2223 NULL, &p, NULL);
2224 if (r <= 0)
2225 return r;
2226
2227 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2228 if (r < 0)
2229 return r;
2230
2231 r = generic_array_bisect_plus_one(f,
2232 le64toh(o->data.entry_offset),
2233 le64toh(o->data.entry_array_offset),
2234 le64toh(o->data.n_entries),
2235 p,
2236 test_object_offset,
2237 direction,
2238 &qo, &q, NULL);
2239
2240 if (r <= 0)
2241 return r;
2242
2243 if (p == q) {
2244 if (ret)
2245 *ret = qo;
2246 if (offset)
2247 *offset = q;
2248
2249 return 1;
2250 }
2251
2252 z = q;
2253 }
cbdca852
LP
2254}
2255
de190aef
LP
2256int journal_file_move_to_entry_by_seqnum_for_data(
2257 JournalFile *f,
2258 uint64_t data_offset,
2259 uint64_t seqnum,
2260 direction_t direction,
2261 Object **ret, uint64_t *offset) {
cec736d2 2262
de190aef
LP
2263 Object *d;
2264 int r;
cec736d2 2265
91a31dde
LP
2266 assert(f);
2267
de190aef 2268 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2269 if (r < 0)
de190aef 2270 return r;
cec736d2 2271
de190aef
LP
2272 return generic_array_bisect_plus_one(f,
2273 le64toh(d->data.entry_offset),
2274 le64toh(d->data.entry_array_offset),
2275 le64toh(d->data.n_entries),
2276 seqnum,
2277 test_object_seqnum,
2278 direction,
2279 ret, offset, NULL);
2280}
cec736d2 2281
de190aef
LP
2282int journal_file_move_to_entry_by_realtime_for_data(
2283 JournalFile *f,
2284 uint64_t data_offset,
2285 uint64_t realtime,
2286 direction_t direction,
2287 Object **ret, uint64_t *offset) {
2288
2289 Object *d;
2290 int r;
2291
91a31dde
LP
2292 assert(f);
2293
de190aef 2294 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2295 if (r < 0)
de190aef
LP
2296 return r;
2297
2298 return generic_array_bisect_plus_one(f,
2299 le64toh(d->data.entry_offset),
2300 le64toh(d->data.entry_array_offset),
2301 le64toh(d->data.n_entries),
2302 realtime,
2303 test_object_realtime,
2304 direction,
2305 ret, offset, NULL);
cec736d2
LP
2306}
2307
0284adc6 2308void journal_file_dump(JournalFile *f) {
7560fffc 2309 Object *o;
7560fffc 2310 int r;
0284adc6 2311 uint64_t p;
7560fffc
LP
2312
2313 assert(f);
2314
0284adc6 2315 journal_file_print_header(f);
7560fffc 2316
0284adc6
LP
2317 p = le64toh(f->header->header_size);
2318 while (p != 0) {
d05089d8 2319 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
2320 if (r < 0)
2321 goto fail;
7560fffc 2322
0284adc6 2323 switch (o->object.type) {
d98cc1f2 2324
0284adc6
LP
2325 case OBJECT_UNUSED:
2326 printf("Type: OBJECT_UNUSED\n");
2327 break;
d98cc1f2 2328
0284adc6
LP
2329 case OBJECT_DATA:
2330 printf("Type: OBJECT_DATA\n");
2331 break;
7560fffc 2332
3c1668da
LP
2333 case OBJECT_FIELD:
2334 printf("Type: OBJECT_FIELD\n");
2335 break;
2336
0284adc6 2337 case OBJECT_ENTRY:
507f22bd
ZJS
2338 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2339 le64toh(o->entry.seqnum),
2340 le64toh(o->entry.monotonic),
2341 le64toh(o->entry.realtime));
0284adc6 2342 break;
7560fffc 2343
0284adc6
LP
2344 case OBJECT_FIELD_HASH_TABLE:
2345 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2346 break;
7560fffc 2347
0284adc6
LP
2348 case OBJECT_DATA_HASH_TABLE:
2349 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2350 break;
7560fffc 2351
0284adc6
LP
2352 case OBJECT_ENTRY_ARRAY:
2353 printf("Type: OBJECT_ENTRY_ARRAY\n");
2354 break;
7560fffc 2355
0284adc6 2356 case OBJECT_TAG:
507f22bd
ZJS
2357 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2358 le64toh(o->tag.seqnum),
2359 le64toh(o->tag.epoch));
0284adc6 2360 break;
3c1668da
LP
2361
2362 default:
2363 printf("Type: unknown (%u)\n", o->object.type);
2364 break;
0284adc6 2365 }
7560fffc 2366
d89c8fdf
ZJS
2367 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2368 printf("Flags: %s\n",
2369 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 2370
0284adc6
LP
2371 if (p == le64toh(f->header->tail_object_offset))
2372 p = 0;
2373 else
2374 p = p + ALIGN64(le64toh(o->object.size));
2375 }
7560fffc 2376
0284adc6
LP
2377 return;
2378fail:
2379 log_error("File corrupt");
7560fffc
LP
2380}
2381
718fe4b1
ZJS
2382static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2383 const char *x;
2384
2385 x = format_timestamp(buf, l, t);
2386 if (x)
2387 return x;
2388 return " --- ";
2389}
2390
0284adc6 2391void journal_file_print_header(JournalFile *f) {
2765b7bb 2392 char a[33], b[33], c[33], d[33];
ed375beb 2393 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2394 struct stat st;
2395 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2396
2397 assert(f);
7560fffc 2398
0284adc6
LP
2399 printf("File Path: %s\n"
2400 "File ID: %s\n"
2401 "Machine ID: %s\n"
2402 "Boot ID: %s\n"
2403 "Sequential Number ID: %s\n"
2404 "State: %s\n"
2405 "Compatible Flags:%s%s\n"
d89c8fdf 2406 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
2407 "Header size: %"PRIu64"\n"
2408 "Arena size: %"PRIu64"\n"
2409 "Data Hash Table Size: %"PRIu64"\n"
2410 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2411 "Rotate Suggested: %s\n"
507f22bd
ZJS
2412 "Head Sequential Number: %"PRIu64"\n"
2413 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2414 "Head Realtime Timestamp: %s\n"
3223f44f 2415 "Tail Realtime Timestamp: %s\n"
ed375beb 2416 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2417 "Objects: %"PRIu64"\n"
2418 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2419 f->path,
2420 sd_id128_to_string(f->header->file_id, a),
2421 sd_id128_to_string(f->header->machine_id, b),
2422 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2423 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2424 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2425 f->header->state == STATE_ONLINE ? "ONLINE" :
2426 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 2427 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
2428 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2429 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2430 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2431 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
2432 le64toh(f->header->header_size),
2433 le64toh(f->header->arena_size),
2434 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2435 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2436 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2437 le64toh(f->header->head_entry_seqnum),
2438 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2439 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2440 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2441 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2442 le64toh(f->header->n_objects),
2443 le64toh(f->header->n_entries));
7560fffc 2444
0284adc6 2445 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2446 printf("Data Objects: %"PRIu64"\n"
0284adc6 2447 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2448 le64toh(f->header->n_data),
0284adc6 2449 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2450
0284adc6 2451 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2452 printf("Field Objects: %"PRIu64"\n"
0284adc6 2453 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2454 le64toh(f->header->n_fields),
0284adc6 2455 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2456
2457 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2458 printf("Tag Objects: %"PRIu64"\n",
2459 le64toh(f->header->n_tags));
3223f44f 2460 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2461 printf("Entry Array Objects: %"PRIu64"\n",
2462 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2463
2464 if (fstat(f->fd, &st) >= 0)
2465 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
7560fffc
LP
2466}
2467
0284adc6
LP
2468int journal_file_open(
2469 const char *fname,
2470 int flags,
2471 mode_t mode,
2472 bool compress,
baed47c3 2473 bool seal,
0284adc6
LP
2474 JournalMetrics *metrics,
2475 MMapCache *mmap_cache,
2476 JournalFile *template,
2477 JournalFile **ret) {
7560fffc 2478
fa6ac760 2479 bool newly_created = false;
0284adc6 2480 JournalFile *f;
fa6ac760 2481 void *h;
0284adc6 2482 int r;
7560fffc 2483
0284adc6 2484 assert(fname);
0559d3a5 2485 assert(ret);
7560fffc 2486
0284adc6
LP
2487 if ((flags & O_ACCMODE) != O_RDONLY &&
2488 (flags & O_ACCMODE) != O_RDWR)
2489 return -EINVAL;
7560fffc 2490
a0108012
LP
2491 if (!endswith(fname, ".journal") &&
2492 !endswith(fname, ".journal~"))
0284adc6 2493 return -EINVAL;
7560fffc 2494
0284adc6
LP
2495 f = new0(JournalFile, 1);
2496 if (!f)
2497 return -ENOMEM;
7560fffc 2498
0284adc6
LP
2499 f->fd = -1;
2500 f->mode = mode;
7560fffc 2501
0284adc6
LP
2502 f->flags = flags;
2503 f->prot = prot_from_flags(flags);
2504 f->writable = (flags & O_ACCMODE) != O_RDONLY;
92261977 2505#if defined(HAVE_LZ4)
d89c8fdf
ZJS
2506 f->compress_lz4 = compress;
2507#elif defined(HAVE_XZ)
2508 f->compress_xz = compress;
48b61739 2509#endif
49a32d43 2510#ifdef HAVE_GCRYPT
baed47c3 2511 f->seal = seal;
49a32d43 2512#endif
7560fffc 2513
0284adc6
LP
2514 if (mmap_cache)
2515 f->mmap = mmap_cache_ref(mmap_cache);
2516 else {
84168d80 2517 f->mmap = mmap_cache_new();
0284adc6
LP
2518 if (!f->mmap) {
2519 r = -ENOMEM;
2520 goto fail;
2521 }
2522 }
7560fffc 2523
0284adc6
LP
2524 f->path = strdup(fname);
2525 if (!f->path) {
2526 r = -ENOMEM;
2527 goto fail;
2528 }
7560fffc 2529
4743015d 2530 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
2531 if (!f->chain_cache) {
2532 r = -ENOMEM;
2533 goto fail;
2534 }
2535
0284adc6
LP
2536 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2537 if (f->fd < 0) {
2538 r = -errno;
2539 goto fail;
7560fffc 2540 }
7560fffc 2541
0284adc6
LP
2542 if (fstat(f->fd, &f->last_stat) < 0) {
2543 r = -errno;
2544 goto fail;
2545 }
7560fffc 2546
0284adc6 2547 if (f->last_stat.st_size == 0 && f->writable) {
fb0951b0
LP
2548 /* Let's attach the creation time to the journal file,
2549 * so that the vacuuming code knows the age of this
2550 * file even if the file might end up corrupted one
2551 * day... Ideally we'd just use the creation time many
2552 * file systems maintain for each file, but there is
2553 * currently no usable API to query this, hence let's
2554 * emulate this via extended attributes. If extended
2555 * attributes are not supported we'll just skip this,
7517e174 2556 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0 2557
4a4d89b6 2558 fd_setcrtime(f->fd, now(CLOCK_REALTIME));
7560fffc 2559
feb12d3e 2560#ifdef HAVE_GCRYPT
0284adc6 2561 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2562 * just don't do sealing */
49a32d43
LP
2563 if (f->seal) {
2564 r = journal_file_fss_load(f);
2565 if (r < 0)
2566 f->seal = false;
2567 }
feb12d3e 2568#endif
7560fffc 2569
0284adc6
LP
2570 r = journal_file_init_header(f, template);
2571 if (r < 0)
2572 goto fail;
7560fffc 2573
0284adc6
LP
2574 if (fstat(f->fd, &f->last_stat) < 0) {
2575 r = -errno;
2576 goto fail;
2577 }
fb0951b0
LP
2578
2579 newly_created = true;
0284adc6 2580 }
7560fffc 2581
0284adc6
LP
2582 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2583 r = -EIO;
2584 goto fail;
2585 }
7560fffc 2586
fa6ac760
LP
2587 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2588 if (r < 0) {
0284adc6
LP
2589 r = -errno;
2590 goto fail;
2591 }
7560fffc 2592
fa6ac760
LP
2593 f->header = h;
2594
0284adc6
LP
2595 if (!newly_created) {
2596 r = journal_file_verify_header(f);
2597 if (r < 0)
2598 goto fail;
2599 }
7560fffc 2600
feb12d3e 2601#ifdef HAVE_GCRYPT
0284adc6 2602 if (!newly_created && f->writable) {
baed47c3 2603 r = journal_file_fss_load(f);
0284adc6
LP
2604 if (r < 0)
2605 goto fail;
2606 }
feb12d3e 2607#endif
cec736d2
LP
2608
2609 if (f->writable) {
4a92baf3
LP
2610 if (metrics) {
2611 journal_default_metrics(metrics, f->fd);
2612 f->metrics = *metrics;
2613 } else if (template)
2614 f->metrics = template->metrics;
2615
cec736d2
LP
2616 r = journal_file_refresh_header(f);
2617 if (r < 0)
2618 goto fail;
2619 }
2620
feb12d3e 2621#ifdef HAVE_GCRYPT
baed47c3 2622 r = journal_file_hmac_setup(f);
14d10188
LP
2623 if (r < 0)
2624 goto fail;
feb12d3e 2625#endif
14d10188 2626
cec736d2 2627 if (newly_created) {
de190aef 2628 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2629 if (r < 0)
2630 goto fail;
2631
de190aef 2632 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2633 if (r < 0)
2634 goto fail;
7560fffc 2635
feb12d3e 2636#ifdef HAVE_GCRYPT
7560fffc
LP
2637 r = journal_file_append_first_tag(f);
2638 if (r < 0)
2639 goto fail;
feb12d3e 2640#endif
cec736d2
LP
2641 }
2642
de190aef 2643 r = journal_file_map_field_hash_table(f);
cec736d2
LP
2644 if (r < 0)
2645 goto fail;
2646
de190aef 2647 r = journal_file_map_data_hash_table(f);
cec736d2
LP
2648 if (r < 0)
2649 goto fail;
2650
fa6ac760
LP
2651 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2652 r = -EIO;
2653 goto fail;
2654 }
2655
0559d3a5 2656 *ret = f;
cec736d2
LP
2657 return 0;
2658
2659fail:
fa6ac760
LP
2660 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2661 r = -EIO;
2662
cec736d2
LP
2663 journal_file_close(f);
2664
2665 return r;
2666}
0ac38b70 2667
baed47c3 2668int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2669 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2670 size_t l;
2671 JournalFile *old_file, *new_file = NULL;
2672 int r;
2673
2674 assert(f);
2675 assert(*f);
2676
2677 old_file = *f;
2678
2679 if (!old_file->writable)
2680 return -EINVAL;
2681
2682 if (!endswith(old_file->path, ".journal"))
2683 return -EINVAL;
2684
2685 l = strlen(old_file->path);
57535f47
ZJS
2686 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2687 (int) l - 8, old_file->path,
2688 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2689 le64toh((*f)->header->head_entry_seqnum),
2690 le64toh((*f)->header->head_entry_realtime));
2691 if (r < 0)
0ac38b70
LP
2692 return -ENOMEM;
2693
0ac38b70 2694 r = rename(old_file->path, p);
0ac38b70
LP
2695 if (r < 0)
2696 return -errno;
2697
ccdbaf91 2698 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2699
baed47c3 2700 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2701 journal_file_close(old_file);
2702
2703 *f = new_file;
2704 return r;
2705}
2706
9447a7f1
LP
2707int journal_file_open_reliably(
2708 const char *fname,
2709 int flags,
2710 mode_t mode,
7560fffc 2711 bool compress,
baed47c3 2712 bool seal,
4a92baf3 2713 JournalMetrics *metrics,
27370278 2714 MMapCache *mmap_cache,
9447a7f1
LP
2715 JournalFile *template,
2716 JournalFile **ret) {
2717
2718 int r;
2719 size_t l;
ed375beb 2720 _cleanup_free_ char *p = NULL;
9447a7f1 2721
baed47c3 2722 r = journal_file_open(fname, flags, mode, compress, seal,
27370278 2723 metrics, mmap_cache, template, ret);
0071d9f1
LP
2724 if (r != -EBADMSG && /* corrupted */
2725 r != -ENODATA && /* truncated */
2726 r != -EHOSTDOWN && /* other machine */
a1a1898f
LP
2727 r != -EPROTONOSUPPORT && /* incompatible feature */
2728 r != -EBUSY && /* unclean shutdown */
fa6ac760
LP
2729 r != -ESHUTDOWN && /* already archived */
2730 r != -EIO /* IO error, including SIGBUS on mmap */)
9447a7f1
LP
2731 return r;
2732
2733 if ((flags & O_ACCMODE) == O_RDONLY)
2734 return r;
2735
2736 if (!(flags & O_CREAT))
2737 return r;
2738
7560fffc
LP
2739 if (!endswith(fname, ".journal"))
2740 return r;
2741
5c70eab4
LP
2742 /* The file is corrupted. Rotate it away and try it again (but only once) */
2743
9447a7f1 2744 l = strlen(fname);
9bf3b535 2745 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
57535f47 2746 (int) l - 8, fname,
9447a7f1 2747 (unsigned long long) now(CLOCK_REALTIME),
9bf3b535 2748 random_u64()) < 0)
9447a7f1
LP
2749 return -ENOMEM;
2750
2751 r = rename(fname, p);
9447a7f1
LP
2752 if (r < 0)
2753 return -errno;
2754
a1a1898f 2755 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2756
baed47c3 2757 return journal_file_open(fname, flags, mode, compress, seal,
27370278 2758 metrics, mmap_cache, template, ret);
9447a7f1
LP
2759}
2760
cf244689
LP
2761int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2762 uint64_t i, n;
2763 uint64_t q, xor_hash = 0;
2764 int r;
2765 EntryItem *items;
2766 dual_timestamp ts;
2767
2768 assert(from);
2769 assert(to);
2770 assert(o);
2771 assert(p);
2772
2773 if (!to->writable)
2774 return -EPERM;
2775
2776 ts.monotonic = le64toh(o->entry.monotonic);
2777 ts.realtime = le64toh(o->entry.realtime);
2778
cf244689 2779 n = journal_file_entry_n_items(o);
4faa7004
TA
2780 /* alloca() can't take 0, hence let's allocate at least one */
2781 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
2782
2783 for (i = 0; i < n; i++) {
4fd052ae
FC
2784 uint64_t l, h;
2785 le64_t le_hash;
cf244689
LP
2786 size_t t;
2787 void *data;
2788 Object *u;
2789
2790 q = le64toh(o->entry.items[i].object_offset);
2791 le_hash = o->entry.items[i].hash;
2792
2793 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2794 if (r < 0)
2795 return r;
2796
2797 if (le_hash != o->data.hash)
2798 return -EBADMSG;
2799
2800 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2801 t = (size_t) l;
2802
2803 /* We hit the limit on 32bit machines */
2804 if ((uint64_t) t != l)
2805 return -E2BIG;
2806
d89c8fdf 2807 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 2808#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51 2809 size_t rsize;
cf244689 2810
d89c8fdf
ZJS
2811 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2812 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2813 if (r < 0)
2814 return r;
cf244689
LP
2815
2816 data = from->compress_buffer;
2817 l = rsize;
3b1a55e1
ZJS
2818#else
2819 return -EPROTONOSUPPORT;
2820#endif
cf244689
LP
2821 } else
2822 data = o->data.payload;
2823
2824 r = journal_file_append_data(to, data, l, &u, &h);
2825 if (r < 0)
2826 return r;
2827
2828 xor_hash ^= le64toh(u->data.hash);
2829 items[i].object_offset = htole64(h);
2830 items[i].hash = u->data.hash;
2831
2832 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2833 if (r < 0)
2834 return r;
2835 }
2836
fa6ac760
LP
2837 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2838
2839 if (mmap_cache_got_sigbus(to->mmap, to->fd))
2840 return -EIO;
2841
2842 return r;
cf244689 2843}
babfc091
LP
2844
2845void journal_default_metrics(JournalMetrics *m, int fd) {
2846 uint64_t fs_size = 0;
2847 struct statvfs ss;
a7bc2c2a 2848 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
babfc091
LP
2849
2850 assert(m);
2851 assert(fd >= 0);
2852
2853 if (fstatvfs(fd, &ss) >= 0)
2854 fs_size = ss.f_frsize * ss.f_blocks;
2855
2856 if (m->max_use == (uint64_t) -1) {
2857
2858 if (fs_size > 0) {
2859 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2860
2861 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2862 m->max_use = DEFAULT_MAX_USE_UPPER;
2863
2864 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2865 m->max_use = DEFAULT_MAX_USE_LOWER;
2866 } else
2867 m->max_use = DEFAULT_MAX_USE_LOWER;
2868 } else {
2869 m->max_use = PAGE_ALIGN(m->max_use);
2870
2871 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2872 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2873 }
2874
2875 if (m->max_size == (uint64_t) -1) {
2876 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2877
2878 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2879 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2880 } else
2881 m->max_size = PAGE_ALIGN(m->max_size);
2882
2883 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2884 m->max_size = JOURNAL_FILE_SIZE_MIN;
2885
2886 if (m->max_size*2 > m->max_use)
2887 m->max_use = m->max_size*2;
2888
2889 if (m->min_size == (uint64_t) -1)
2890 m->min_size = JOURNAL_FILE_SIZE_MIN;
2891 else {
2892 m->min_size = PAGE_ALIGN(m->min_size);
2893
2894 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2895 m->min_size = JOURNAL_FILE_SIZE_MIN;
2896
2897 if (m->min_size > m->max_size)
2898 m->max_size = m->min_size;
2899 }
2900
2901 if (m->keep_free == (uint64_t) -1) {
2902
2903 if (fs_size > 0) {
8621b110 2904 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
2905
2906 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2907 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2908
2909 } else
2910 m->keep_free = DEFAULT_KEEP_FREE;
2911 }
2912
2b43f939
LP
2913 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2914 format_bytes(a, sizeof(a), m->max_use),
2915 format_bytes(b, sizeof(b), m->max_size),
2916 format_bytes(c, sizeof(c), m->min_size),
2917 format_bytes(d, sizeof(d), m->keep_free));
babfc091 2918}
08984293
LP
2919
2920int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
2921 assert(f);
2922 assert(from || to);
2923
2924 if (from) {
162566a4
LP
2925 if (f->header->head_entry_realtime == 0)
2926 return -ENOENT;
08984293 2927
162566a4 2928 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
2929 }
2930
2931 if (to) {
162566a4
LP
2932 if (f->header->tail_entry_realtime == 0)
2933 return -ENOENT;
08984293 2934
162566a4 2935 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
2936 }
2937
2938 return 1;
2939}
2940
2941int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
2942 Object *o;
2943 uint64_t p;
2944 int r;
2945
2946 assert(f);
2947 assert(from || to);
2948
47838ab3 2949 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
2950 if (r <= 0)
2951 return r;
2952
2953 if (le64toh(o->data.n_entries) <= 0)
2954 return 0;
2955
2956 if (from) {
2957 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2958 if (r < 0)
2959 return r;
2960
2961 *from = le64toh(o->entry.monotonic);
2962 }
2963
2964 if (to) {
2965 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2966 if (r < 0)
2967 return r;
2968
2969 r = generic_array_get_plus_one(f,
2970 le64toh(o->data.entry_offset),
2971 le64toh(o->data.entry_array_offset),
2972 le64toh(o->data.n_entries)-1,
2973 &o, NULL);
2974 if (r <= 0)
2975 return r;
2976
2977 *to = le64toh(o->entry.monotonic);
2978 }
2979
2980 return 1;
2981}
dca6219e 2982
fb0951b0 2983bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
2984 assert(f);
2985
2986 /* If we gained new header fields we gained new features,
2987 * hence suggest a rotation */
361f9cbc
LP
2988 if (le64toh(f->header->header_size) < sizeof(Header)) {
2989 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 2990 return true;
361f9cbc 2991 }
dca6219e
LP
2992
2993 /* Let's check if the hash tables grew over a certain fill
2994 * level (75%, borrowing this value from Java's hash table
2995 * implementation), and if so suggest a rotation. To calculate
2996 * the fill level we need the n_data field, which only exists
2997 * in newer versions. */
2998
2999 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3000 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3001 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3002 f->path,
3003 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3004 le64toh(f->header->n_data),
3005 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3006 (unsigned long long) f->last_stat.st_size,
3007 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3008 return true;
361f9cbc 3009 }
dca6219e
LP
3010
3011 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3012 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3013 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3014 f->path,
3015 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3016 le64toh(f->header->n_fields),
3017 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3018 return true;
361f9cbc 3019 }
dca6219e 3020
0598fd4a
LP
3021 /* Are the data objects properly indexed by field objects? */
3022 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3023 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3024 le64toh(f->header->n_data) > 0 &&
3025 le64toh(f->header->n_fields) == 0)
3026 return true;
3027
fb0951b0
LP
3028 if (max_file_usec > 0) {
3029 usec_t t, h;
3030
3031 h = le64toh(f->header->head_entry_realtime);
3032 t = now(CLOCK_REALTIME);
3033
3034 if (h > 0 && t > h + max_file_usec)
3035 return true;
3036 }
3037
dca6219e
LP
3038 return false;
3039}