]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
Update TODO
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <sys/mman.h>
23#include <errno.h>
24#include <sys/uio.h>
25#include <unistd.h>
26#include <sys/statvfs.h>
27#include <fcntl.h>
28#include <stddef.h>
fb0951b0 29
cec736d2
LP
30#include "journal-def.h"
31#include "journal-file.h"
0284adc6 32#include "journal-authenticate.h"
cec736d2 33#include "lookup3.h"
807e17f0 34#include "compress.h"
7560fffc 35#include "fsprg.h"
cec736d2 36
4a92baf3
LP
37#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
38#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 39
be19b7df 40#define COMPRESSION_SIZE_THRESHOLD (512ULL)
807e17f0 41
babfc091 42/* This is the minimum journal file size */
253f59df 43#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
babfc091
LP
44
45/* These are the lower and upper bounds if we deduce the max_use value
46 * from the file system size */
47#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
48#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
49
50/* This is the upper bound if we deduce max_size from max_use */
71100051 51#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
babfc091
LP
52
53/* This is the upper bound if we deduce the keep_free value from the
54 * file system size */
55#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
56
57/* This is the keep_free value when we can't determine the system
58 * size */
59#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
60
dca6219e
LP
61/* n_data was the first entry we added after the initial file format design */
62#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 63
a4bcff5b
LP
64/* How many entries to keep in the entry array chain cache at max */
65#define CHAIN_CACHE_MAX 20
66
a676e665
LP
67/* How much to increase the journal file size at once each time we allocate something new. */
68#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
69
2678031a
LP
70/* Reread fstat() of the file for detecting deletions at least this often */
71#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
72
fa6ac760
LP
/* The mmap context to use for the header: we pick the one right above the last defined object type */
74#define CONTEXT_HEADER _OBJECT_TYPE_MAX
75
9588bc32 76static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
77 assert(f);
78
79 if (!f->writable)
80 return -EPERM;
81
82 if (!(f->fd >= 0 && f->header))
83 return -EINVAL;
84
fa6ac760
LP
85 if (mmap_cache_got_sigbus(f->mmap, f->fd))
86 return -EIO;
87
26687bf8
OS
88 switch(f->header->state) {
89 case STATE_ONLINE:
90 return 0;
91
92 case STATE_OFFLINE:
93 f->header->state = STATE_ONLINE;
94 fsync(f->fd);
95 return 0;
96
97 default:
98 return -EINVAL;
99 }
100}
101
102int journal_file_set_offline(JournalFile *f) {
103 assert(f);
104
105 if (!f->writable)
106 return -EPERM;
107
108 if (!(f->fd >= 0 && f->header))
109 return -EINVAL;
110
111 if (f->header->state != STATE_ONLINE)
112 return 0;
113
114 fsync(f->fd);
115
fa6ac760
LP
116 if (mmap_cache_got_sigbus(f->mmap, f->fd))
117 return -EIO;
118
26687bf8
OS
119 f->header->state = STATE_OFFLINE;
120
fa6ac760
LP
121 if (mmap_cache_got_sigbus(f->mmap, f->fd))
122 return -EIO;
123
26687bf8
OS
124 fsync(f->fd);
125
126 return 0;
127}
128
cec736d2 129void journal_file_close(JournalFile *f) {
de190aef 130 assert(f);
cec736d2 131
feb12d3e 132#ifdef HAVE_GCRYPT
b0af6f41 133 /* Write the final tag */
c586dbf1 134 if (f->seal && f->writable)
b0af6f41 135 journal_file_append_tag(f);
feb12d3e 136#endif
b0af6f41 137
26687bf8 138 journal_file_set_offline(f);
cec736d2 139
fa6ac760
LP
140 if (f->mmap && f->fd >= 0)
141 mmap_cache_close_fd(f->mmap, f->fd);
cec736d2 142
03e334a1 143 safe_close(f->fd);
cec736d2 144 free(f->path);
807e17f0 145
16e9f408
LP
146 if (f->mmap)
147 mmap_cache_unref(f->mmap);
148
4743015d 149 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 150
d89c8fdf 151#if defined(HAVE_XZ) || defined(HAVE_LZ4)
807e17f0
LP
152 free(f->compress_buffer);
153#endif
154
7560fffc 155#ifdef HAVE_GCRYPT
baed47c3
LP
156 if (f->fss_file)
157 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
b7c9ae91
LP
158 else if (f->fsprg_state)
159 free(f->fsprg_state);
160
161 free(f->fsprg_seed);
7560fffc
LP
162
163 if (f->hmac)
164 gcry_md_close(f->hmac);
165#endif
166
cec736d2
LP
167 free(f);
168}
169
0ac38b70 170static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 171 Header h = {};
cec736d2
LP
172 ssize_t k;
173 int r;
174
175 assert(f);
176
7560fffc 177 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 178 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 179
d89c8fdf
ZJS
180 h.incompatible_flags |= htole32(
181 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
182 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 183
d89c8fdf
ZJS
184 h.compatible_flags = htole32(
185 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 186
cec736d2
LP
187 r = sd_id128_randomize(&h.file_id);
188 if (r < 0)
189 return r;
190
0ac38b70
LP
191 if (template) {
192 h.seqnum_id = template->header->seqnum_id;
beec0085 193 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
194 } else
195 h.seqnum_id = h.file_id;
cec736d2
LP
196
197 k = pwrite(f->fd, &h, sizeof(h), 0);
198 if (k < 0)
199 return -errno;
200
201 if (k != sizeof(h))
202 return -EIO;
203
204 return 0;
205}
206
207static int journal_file_refresh_header(JournalFile *f) {
de190aef 208 sd_id128_t boot_id;
fa6ac760 209 int r;
cec736d2
LP
210
211 assert(f);
212
213 r = sd_id128_get_machine(&f->header->machine_id);
214 if (r < 0)
215 return r;
216
de190aef 217 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
218 if (r < 0)
219 return r;
220
de190aef
LP
221 if (sd_id128_equal(boot_id, f->header->boot_id))
222 f->tail_entry_monotonic_valid = true;
223
224 f->header->boot_id = boot_id;
225
fa6ac760 226 r = journal_file_set_online(f);
b788cc23 227
7560fffc 228 /* Sync the online state to disk */
a676e665 229 fsync(f->fd);
b788cc23 230
fa6ac760 231 return r;
cec736d2
LP
232}
233
234static int journal_file_verify_header(JournalFile *f) {
d89c8fdf
ZJS
235 uint32_t flags;
236
cec736d2
LP
237 assert(f);
238
7560fffc 239 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
240 return -EBADMSG;
241
7560fffc
LP
242 /* In both read and write mode we refuse to open files with
243 * incompatible flags we don't know */
d89c8fdf
ZJS
244 flags = le32toh(f->header->incompatible_flags);
245 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
246 if (flags & ~HEADER_INCOMPATIBLE_ANY)
247 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
248 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
249 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
250 if (flags)
251 log_debug("Journal file %s uses incompatible flags %"PRIx32
252 " disabled at compilation time.", f->path, flags);
cec736d2 253 return -EPROTONOSUPPORT;
d89c8fdf 254 }
cec736d2 255
7560fffc
LP
256 /* When open for writing we refuse to open files with
257 * compatible flags, too */
d89c8fdf
ZJS
258 flags = le32toh(f->header->compatible_flags);
259 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
260 if (flags & ~HEADER_COMPATIBLE_ANY)
261 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
262 f->path, flags & ~HEADER_COMPATIBLE_ANY);
263 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
264 if (flags)
265 log_debug("Journal file %s uses compatible flags %"PRIx32
266 " disabled at compilation time.", f->path, flags);
267 return -EPROTONOSUPPORT;
7560fffc
LP
268 }
269
db11ac1a
LP
270 if (f->header->state >= _STATE_MAX)
271 return -EBADMSG;
272
dca6219e
LP
273 /* The first addition was n_data, so check that we are at least this large */
274 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
275 return -EBADMSG;
276
8088cbd3 277 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
278 return -EBADMSG;
279
db11ac1a
LP
280 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
281 return -ENODATA;
282
283 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
284 return -ENODATA;
285
7762e02b
LP
286 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
287 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
288 !VALID64(le64toh(f->header->tail_object_offset)) ||
289 !VALID64(le64toh(f->header->entry_array_offset)))
290 return -ENODATA;
291
cec736d2 292 if (f->writable) {
ccdbaf91 293 uint8_t state;
cec736d2
LP
294 sd_id128_t machine_id;
295 int r;
296
297 r = sd_id128_get_machine(&machine_id);
298 if (r < 0)
299 return r;
300
301 if (!sd_id128_equal(machine_id, f->header->machine_id))
302 return -EHOSTDOWN;
303
de190aef 304 state = f->header->state;
cec736d2 305
71fa6f00
LP
306 if (state == STATE_ONLINE) {
307 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
308 return -EBUSY;
309 } else if (state == STATE_ARCHIVED)
cec736d2 310 return -ESHUTDOWN;
71fa6f00
LP
311 else if (state != STATE_OFFLINE) {
312 log_debug("Journal file %s has unknown state %u.", f->path, state);
313 return -EBUSY;
314 }
cec736d2
LP
315 }
316
d89c8fdf
ZJS
317 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
318 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 319
f1889c91 320 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 321
cec736d2
LP
322 return 0;
323}
324
2678031a
LP
325static int journal_file_fstat(JournalFile *f) {
326 assert(f);
327 assert(f->fd >= 0);
328
329 if (fstat(f->fd, &f->last_stat) < 0)
330 return -errno;
331
332 f->last_stat_usec = now(CLOCK_MONOTONIC);
333
334 /* Refuse appending to files that are already deleted */
335 if (f->last_stat.st_nlink <= 0)
336 return -EIDRM;
337
338 return 0;
339}
340
cec736d2 341static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 342 uint64_t old_size, new_size;
fec2aa2f 343 int r;
cec736d2
LP
344
345 assert(f);
346
cec736d2 347 /* We assume that this file is not sparse, and we know that
38ac38b2 348 * for sure, since we always call posix_fallocate()
cec736d2
LP
349 * ourselves */
350
fa6ac760
LP
351 if (mmap_cache_got_sigbus(f->mmap, f->fd))
352 return -EIO;
353
cec736d2 354 old_size =
23b0b2b2 355 le64toh(f->header->header_size) +
cec736d2
LP
356 le64toh(f->header->arena_size);
357
bc85bfee 358 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
359 if (new_size < le64toh(f->header->header_size))
360 new_size = le64toh(f->header->header_size);
bc85bfee 361
2678031a
LP
362 if (new_size <= old_size) {
363
364 /* We already pre-allocated enough space, but before
365 * we write to it, let's check with fstat() if the
366 * file got deleted, in order make sure we don't throw
367 * away the data immediately. Don't check fstat() for
368 * all writes though, but only once ever 10s. */
369
370 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
371 return 0;
372
373 return journal_file_fstat(f);
374 }
375
376 /* Allocate more space. */
cec736d2 377
a676e665 378 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 379 return -E2BIG;
cec736d2 380
a676e665 381 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
382 struct statvfs svfs;
383
384 if (fstatvfs(f->fd, &svfs) >= 0) {
385 uint64_t available;
386
387 available = svfs.f_bfree * svfs.f_bsize;
388
bc85bfee
LP
389 if (available >= f->metrics.keep_free)
390 available -= f->metrics.keep_free;
cec736d2
LP
391 else
392 available = 0;
393
394 if (new_size - old_size > available)
395 return -E2BIG;
396 }
397 }
398
eda4b58b
LP
399 /* Increase by larger blocks at once */
400 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
401 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
402 new_size = f->metrics.max_size;
403
bc85bfee
LP
404 /* Note that the glibc fallocate() fallback is very
405 inefficient, hence we try to minimize the allocation area
406 as we can. */
fec2aa2f
GV
407 r = posix_fallocate(f->fd, old_size, new_size - old_size);
408 if (r != 0)
409 return -r;
cec736d2 410
23b0b2b2 411 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 412
2678031a 413 return journal_file_fstat(f);
cec736d2
LP
414}
415
78519831 416static unsigned type_to_context(ObjectType type) {
d3d3208f 417 /* One context for each type, plus one catch-all for the rest */
69adae51 418 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 419 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 420 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
421}
422
7a9dabea 423static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
2678031a
LP
424 int r;
425
cec736d2 426 assert(f);
cec736d2
LP
427 assert(ret);
428
7762e02b
LP
429 if (size <= 0)
430 return -EINVAL;
431
2a59ea54 432 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
433 if (offset + size > (uint64_t) f->last_stat.st_size) {
434 /* Hmm, out of range? Let's refresh the fstat() data
435 * first, before we trust that check. */
436
2678031a
LP
437 r = journal_file_fstat(f);
438 if (r < 0)
439 return r;
440
441 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
442 return -EADDRNOTAVAIL;
443 }
444
7a9dabea 445 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
446}
447
16e9f408
LP
448static uint64_t minimum_header_size(Object *o) {
449
b8e891e6 450 static const uint64_t table[] = {
16e9f408
LP
451 [OBJECT_DATA] = sizeof(DataObject),
452 [OBJECT_FIELD] = sizeof(FieldObject),
453 [OBJECT_ENTRY] = sizeof(EntryObject),
454 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
455 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
456 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
457 [OBJECT_TAG] = sizeof(TagObject),
458 };
459
460 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
461 return sizeof(ObjectHeader);
462
463 return table[o->object.type];
464}
465
78519831 466int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
467 int r;
468 void *t;
469 Object *o;
470 uint64_t s;
471
472 assert(f);
473 assert(ret);
474
db11ac1a
LP
475 /* Objects may only be located at multiple of 64 bit */
476 if (!VALID64(offset))
477 return -EFAULT;
478
7a9dabea 479 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
480 if (r < 0)
481 return r;
482
483 o = (Object*) t;
484 s = le64toh(o->object.size);
485
486 if (s < sizeof(ObjectHeader))
487 return -EBADMSG;
488
16e9f408
LP
489 if (o->object.type <= OBJECT_UNUSED)
490 return -EBADMSG;
491
492 if (s < minimum_header_size(o))
493 return -EBADMSG;
494
d05089d8 495 if (type > OBJECT_UNUSED && o->object.type != type)
cec736d2
LP
496 return -EBADMSG;
497
498 if (s > sizeof(ObjectHeader)) {
7a9dabea 499 r = journal_file_move_to(f, type, false, offset, s, &t);
cec736d2
LP
500 if (r < 0)
501 return r;
502
503 o = (Object*) t;
504 }
505
cec736d2
LP
506 *ret = o;
507 return 0;
508}
509
d98cc1f2 510static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
511 uint64_t r;
512
513 assert(f);
514
beec0085 515 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
516
517 if (seqnum) {
de190aef 518 /* If an external seqnum counter was passed, we update
c2373f84
LP
519 * both the local and the external one, and set it to
520 * the maximum of both */
521
522 if (*seqnum + 1 > r)
523 r = *seqnum + 1;
524
525 *seqnum = r;
526 }
527
beec0085 528 f->header->tail_entry_seqnum = htole64(r);
cec736d2 529
beec0085
LP
530 if (f->header->head_entry_seqnum == 0)
531 f->header->head_entry_seqnum = htole64(r);
de190aef 532
cec736d2
LP
533 return r;
534}
535
78519831 536int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
537 int r;
538 uint64_t p;
539 Object *tail, *o;
540 void *t;
541
542 assert(f);
d05089d8 543 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
544 assert(size >= sizeof(ObjectHeader));
545 assert(offset);
546 assert(ret);
547
26687bf8
OS
548 r = journal_file_set_online(f);
549 if (r < 0)
550 return r;
551
cec736d2 552 p = le64toh(f->header->tail_object_offset);
cec736d2 553 if (p == 0)
23b0b2b2 554 p = le64toh(f->header->header_size);
cec736d2 555 else {
d05089d8 556 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
557 if (r < 0)
558 return r;
559
560 p += ALIGN64(le64toh(tail->object.size));
561 }
562
563 r = journal_file_allocate(f, p, size);
564 if (r < 0)
565 return r;
566
fcde2389 567 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
568 if (r < 0)
569 return r;
570
571 o = (Object*) t;
572
573 zero(o->object);
de190aef 574 o->object.type = type;
cec736d2
LP
575 o->object.size = htole64(size);
576
577 f->header->tail_object_offset = htole64(p);
cec736d2
LP
578 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
579
580 *ret = o;
581 *offset = p;
582
583 return 0;
584}
585
de190aef 586static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
587 uint64_t s, p;
588 Object *o;
589 int r;
590
591 assert(f);
592
dfabe643 593 /* We estimate that we need 1 hash table entry per 768 of
4a92baf3
LP
594 journal file and we want to make sure we never get beyond
595 75% fill level. Calculate the hash table size for the
596 maximum file size based on these metrics. */
597
dfabe643 598 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
599 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
600 s = DEFAULT_DATA_HASH_TABLE_SIZE;
601
507f22bd 602 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 603
de190aef
LP
604 r = journal_file_append_object(f,
605 OBJECT_DATA_HASH_TABLE,
606 offsetof(Object, hash_table.items) + s,
607 &o, &p);
cec736d2
LP
608 if (r < 0)
609 return r;
610
29804cc1 611 memzero(o->hash_table.items, s);
cec736d2 612
de190aef
LP
613 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
614 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
615
616 return 0;
617}
618
de190aef 619static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
620 uint64_t s, p;
621 Object *o;
622 int r;
623
624 assert(f);
625
3c1668da
LP
626 /* We use a fixed size hash table for the fields as this
627 * number should grow very slowly only */
628
de190aef
LP
629 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
630 r = journal_file_append_object(f,
631 OBJECT_FIELD_HASH_TABLE,
632 offsetof(Object, hash_table.items) + s,
633 &o, &p);
cec736d2
LP
634 if (r < 0)
635 return r;
636
29804cc1 637 memzero(o->hash_table.items, s);
cec736d2 638
de190aef
LP
639 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
640 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
641
642 return 0;
643}
644
de190aef 645static int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
646 uint64_t s, p;
647 void *t;
648 int r;
649
650 assert(f);
651
de190aef
LP
652 p = le64toh(f->header->data_hash_table_offset);
653 s = le64toh(f->header->data_hash_table_size);
cec736d2 654
de190aef 655 r = journal_file_move_to(f,
16e9f408 656 OBJECT_DATA_HASH_TABLE,
fcde2389 657 true,
de190aef
LP
658 p, s,
659 &t);
cec736d2
LP
660 if (r < 0)
661 return r;
662
de190aef 663 f->data_hash_table = t;
cec736d2
LP
664 return 0;
665}
666
de190aef 667static int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
668 uint64_t s, p;
669 void *t;
670 int r;
671
672 assert(f);
673
de190aef
LP
674 p = le64toh(f->header->field_hash_table_offset);
675 s = le64toh(f->header->field_hash_table_size);
cec736d2 676
de190aef 677 r = journal_file_move_to(f,
16e9f408 678 OBJECT_FIELD_HASH_TABLE,
fcde2389 679 true,
de190aef
LP
680 p, s,
681 &t);
cec736d2
LP
682 if (r < 0)
683 return r;
684
de190aef 685 f->field_hash_table = t;
cec736d2
LP
686 return 0;
687}
688
3c1668da
LP
689static int journal_file_link_field(
690 JournalFile *f,
691 Object *o,
692 uint64_t offset,
693 uint64_t hash) {
694
805d1486 695 uint64_t p, h, m;
3c1668da
LP
696 int r;
697
698 assert(f);
699 assert(o);
700 assert(offset > 0);
701
702 if (o->object.type != OBJECT_FIELD)
703 return -EINVAL;
704
805d1486
LP
705 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
706 if (m <= 0)
707 return -EBADMSG;
3c1668da 708
805d1486 709 /* This might alter the window we are looking at */
3c1668da
LP
710 o->field.next_hash_offset = o->field.head_data_offset = 0;
711
805d1486 712 h = hash % m;
3c1668da
LP
713 p = le64toh(f->field_hash_table[h].tail_hash_offset);
714 if (p == 0)
715 f->field_hash_table[h].head_hash_offset = htole64(offset);
716 else {
717 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
718 if (r < 0)
719 return r;
720
721 o->field.next_hash_offset = htole64(offset);
722 }
723
724 f->field_hash_table[h].tail_hash_offset = htole64(offset);
725
726 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
727 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
728
729 return 0;
730}
731
732static int journal_file_link_data(
733 JournalFile *f,
734 Object *o,
735 uint64_t offset,
736 uint64_t hash) {
737
805d1486 738 uint64_t p, h, m;
cec736d2
LP
739 int r;
740
741 assert(f);
742 assert(o);
743 assert(offset > 0);
b588975f
LP
744
745 if (o->object.type != OBJECT_DATA)
746 return -EINVAL;
cec736d2 747
805d1486
LP
748 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
749 if (m <= 0)
750 return -EBADMSG;
48496df6 751
805d1486 752 /* This might alter the window we are looking at */
de190aef
LP
753 o->data.next_hash_offset = o->data.next_field_offset = 0;
754 o->data.entry_offset = o->data.entry_array_offset = 0;
755 o->data.n_entries = 0;
cec736d2 756
805d1486 757 h = hash % m;
8db4213e 758 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 759 if (p == 0)
cec736d2 760 /* Only entry in the hash table is easy */
de190aef 761 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 762 else {
48496df6
LP
763 /* Move back to the previous data object, to patch in
764 * pointer */
cec736d2 765
de190aef 766 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
767 if (r < 0)
768 return r;
769
de190aef 770 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
771 }
772
de190aef 773 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 774
dca6219e
LP
775 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
776 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
777
cec736d2
LP
778 return 0;
779}
780
3c1668da
LP
781int journal_file_find_field_object_with_hash(
782 JournalFile *f,
783 const void *field, uint64_t size, uint64_t hash,
784 Object **ret, uint64_t *offset) {
785
805d1486 786 uint64_t p, osize, h, m;
3c1668da
LP
787 int r;
788
789 assert(f);
790 assert(field && size > 0);
791
792 osize = offsetof(Object, field.payload) + size;
793
805d1486
LP
794 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
795
796 if (m <= 0)
3c1668da
LP
797 return -EBADMSG;
798
805d1486 799 h = hash % m;
3c1668da
LP
800 p = le64toh(f->field_hash_table[h].head_hash_offset);
801
802 while (p > 0) {
803 Object *o;
804
805 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
806 if (r < 0)
807 return r;
808
809 if (le64toh(o->field.hash) == hash &&
810 le64toh(o->object.size) == osize &&
811 memcmp(o->field.payload, field, size) == 0) {
812
813 if (ret)
814 *ret = o;
815 if (offset)
816 *offset = p;
817
818 return 1;
819 }
820
821 p = le64toh(o->field.next_hash_offset);
822 }
823
824 return 0;
825}
826
827int journal_file_find_field_object(
828 JournalFile *f,
829 const void *field, uint64_t size,
830 Object **ret, uint64_t *offset) {
831
832 uint64_t hash;
833
834 assert(f);
835 assert(field && size > 0);
836
837 hash = hash64(field, size);
838
839 return journal_file_find_field_object_with_hash(f,
840 field, size, hash,
841 ret, offset);
842}
843
de190aef
LP
844int journal_file_find_data_object_with_hash(
845 JournalFile *f,
846 const void *data, uint64_t size, uint64_t hash,
847 Object **ret, uint64_t *offset) {
48496df6 848
805d1486 849 uint64_t p, osize, h, m;
cec736d2
LP
850 int r;
851
852 assert(f);
853 assert(data || size == 0);
854
855 osize = offsetof(Object, data.payload) + size;
856
805d1486
LP
857 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
858 if (m <= 0)
bc85bfee
LP
859 return -EBADMSG;
860
805d1486 861 h = hash % m;
de190aef 862 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 863
de190aef
LP
864 while (p > 0) {
865 Object *o;
cec736d2 866
de190aef 867 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
868 if (r < 0)
869 return r;
870
807e17f0 871 if (le64toh(o->data.hash) != hash)
85a131e8 872 goto next;
807e17f0 873
d89c8fdf 874 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 875#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51
ZJS
876 uint64_t l;
877 size_t rsize;
cec736d2 878
807e17f0
LP
879 l = le64toh(o->object.size);
880 if (l <= offsetof(Object, data.payload))
cec736d2
LP
881 return -EBADMSG;
882
807e17f0
LP
883 l -= offsetof(Object, data.payload);
884
d89c8fdf
ZJS
885 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
886 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
887 if (r < 0)
888 return r;
807e17f0 889
b785c858 890 if (rsize == size &&
807e17f0
LP
891 memcmp(f->compress_buffer, data, size) == 0) {
892
893 if (ret)
894 *ret = o;
895
896 if (offset)
897 *offset = p;
898
899 return 1;
900 }
3b1a55e1
ZJS
901#else
902 return -EPROTONOSUPPORT;
903#endif
807e17f0
LP
904 } else if (le64toh(o->object.size) == osize &&
905 memcmp(o->data.payload, data, size) == 0) {
906
cec736d2
LP
907 if (ret)
908 *ret = o;
909
910 if (offset)
911 *offset = p;
912
de190aef 913 return 1;
cec736d2
LP
914 }
915
85a131e8 916 next:
cec736d2
LP
917 p = le64toh(o->data.next_hash_offset);
918 }
919
de190aef
LP
920 return 0;
921}
922
923int journal_file_find_data_object(
924 JournalFile *f,
925 const void *data, uint64_t size,
926 Object **ret, uint64_t *offset) {
927
928 uint64_t hash;
929
930 assert(f);
931 assert(data || size == 0);
932
933 hash = hash64(data, size);
934
935 return journal_file_find_data_object_with_hash(f,
936 data, size, hash,
937 ret, offset);
938}
939
3c1668da
LP
940static int journal_file_append_field(
941 JournalFile *f,
942 const void *field, uint64_t size,
943 Object **ret, uint64_t *offset) {
944
945 uint64_t hash, p;
946 uint64_t osize;
947 Object *o;
948 int r;
949
950 assert(f);
951 assert(field && size > 0);
952
953 hash = hash64(field, size);
954
955 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
956 if (r < 0)
957 return r;
958 else if (r > 0) {
959
960 if (ret)
961 *ret = o;
962
963 if (offset)
964 *offset = p;
965
966 return 0;
967 }
968
969 osize = offsetof(Object, field.payload) + size;
970 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
971 if (r < 0)
972 return r;
3c1668da
LP
973
974 o->field.hash = htole64(hash);
975 memcpy(o->field.payload, field, size);
976
977 r = journal_file_link_field(f, o, p, hash);
978 if (r < 0)
979 return r;
980
981 /* The linking might have altered the window, so let's
982 * refresh our pointer */
983 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
984 if (r < 0)
985 return r;
986
987#ifdef HAVE_GCRYPT
988 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
989 if (r < 0)
990 return r;
991#endif
992
993 if (ret)
994 *ret = o;
995
996 if (offset)
997 *offset = p;
998
999 return 0;
1000}
1001
48496df6
LP
1002static int journal_file_append_data(
1003 JournalFile *f,
1004 const void *data, uint64_t size,
1005 Object **ret, uint64_t *offset) {
1006
de190aef
LP
1007 uint64_t hash, p;
1008 uint64_t osize;
1009 Object *o;
d89c8fdf 1010 int r, compression = 0;
3c1668da 1011 const void *eq;
de190aef
LP
1012
1013 assert(f);
1014 assert(data || size == 0);
1015
1016 hash = hash64(data, size);
1017
1018 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1019 if (r < 0)
1020 return r;
1021 else if (r > 0) {
1022
1023 if (ret)
1024 *ret = o;
1025
1026 if (offset)
1027 *offset = p;
1028
1029 return 0;
1030 }
1031
1032 osize = offsetof(Object, data.payload) + size;
1033 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1034 if (r < 0)
1035 return r;
1036
cec736d2 1037 o->data.hash = htole64(hash);
807e17f0 1038
d89c8fdf
ZJS
1039#if defined(HAVE_XZ) || defined(HAVE_LZ4)
1040 if (f->compress_xz &&
807e17f0 1041 size >= COMPRESSION_SIZE_THRESHOLD) {
fa1c4b51 1042 size_t rsize;
807e17f0 1043
d89c8fdf 1044 compression = compress_blob(data, size, o->data.payload, &rsize);
807e17f0 1045
d89c8fdf 1046 if (compression) {
807e17f0 1047 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1048 o->object.flags |= compression;
807e17f0 1049
fa1c4b51 1050 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1051 size, rsize, object_compressed_to_string(compression));
807e17f0
LP
1052 }
1053 }
1054#endif
1055
d89c8fdf 1056 if (!compression && size > 0)
807e17f0 1057 memcpy(o->data.payload, data, size);
cec736d2 1058
de190aef 1059 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1060 if (r < 0)
1061 return r;
1062
48496df6
LP
1063 /* The linking might have altered the window, so let's
1064 * refresh our pointer */
1065 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1066 if (r < 0)
1067 return r;
1068
08c6f819
SL
1069 if (!data)
1070 eq = NULL;
1071 else
1072 eq = memchr(data, '=', size);
3c1668da 1073 if (eq && eq > data) {
748db592 1074 Object *fo = NULL;
3c1668da 1075 uint64_t fp;
3c1668da
LP
1076
1077 /* Create field object ... */
1078 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1079 if (r < 0)
1080 return r;
1081
1082 /* ... and link it in. */
1083 o->data.next_field_offset = fo->field.head_data_offset;
1084 fo->field.head_data_offset = le64toh(p);
1085 }
1086
5996c7c2
LP
1087#ifdef HAVE_GCRYPT
1088 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1089 if (r < 0)
1090 return r;
1091#endif
1092
cec736d2
LP
1093 if (ret)
1094 *ret = o;
1095
1096 if (offset)
de190aef 1097 *offset = p;
cec736d2
LP
1098
1099 return 0;
1100}
1101
1102uint64_t journal_file_entry_n_items(Object *o) {
1103 assert(o);
b588975f
LP
1104
1105 if (o->object.type != OBJECT_ENTRY)
1106 return 0;
cec736d2
LP
1107
1108 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1109}
1110
0284adc6 1111uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1112 assert(o);
b588975f
LP
1113
1114 if (o->object.type != OBJECT_ENTRY_ARRAY)
1115 return 0;
de190aef
LP
1116
1117 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1118}
1119
fb9a24b6
LP
1120uint64_t journal_file_hash_table_n_items(Object *o) {
1121 assert(o);
b588975f
LP
1122
1123 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1124 o->object.type != OBJECT_FIELD_HASH_TABLE)
1125 return 0;
fb9a24b6
LP
1126
1127 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1128}
1129
de190aef 1130static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1131 le64_t *first,
1132 le64_t *idx,
de190aef 1133 uint64_t p) {
cec736d2 1134 int r;
de190aef
LP
1135 uint64_t n = 0, ap = 0, q, i, a, hidx;
1136 Object *o;
1137
cec736d2 1138 assert(f);
de190aef
LP
1139 assert(first);
1140 assert(idx);
1141 assert(p > 0);
cec736d2 1142
de190aef
LP
1143 a = le64toh(*first);
1144 i = hidx = le64toh(*idx);
1145 while (a > 0) {
1146
1147 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1148 if (r < 0)
1149 return r;
cec736d2 1150
de190aef
LP
1151 n = journal_file_entry_array_n_items(o);
1152 if (i < n) {
1153 o->entry_array.items[i] = htole64(p);
1154 *idx = htole64(hidx + 1);
1155 return 0;
1156 }
cec736d2 1157
de190aef
LP
1158 i -= n;
1159 ap = a;
1160 a = le64toh(o->entry_array.next_entry_array_offset);
1161 }
1162
1163 if (hidx > n)
1164 n = (hidx+1) * 2;
1165 else
1166 n = n * 2;
1167
1168 if (n < 4)
1169 n = 4;
1170
1171 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1172 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1173 &o, &q);
cec736d2
LP
1174 if (r < 0)
1175 return r;
1176
feb12d3e 1177#ifdef HAVE_GCRYPT
5996c7c2 1178 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1179 if (r < 0)
1180 return r;
feb12d3e 1181#endif
b0af6f41 1182
de190aef 1183 o->entry_array.items[i] = htole64(p);
cec736d2 1184
de190aef 1185 if (ap == 0)
7be3aa17 1186 *first = htole64(q);
cec736d2 1187 else {
de190aef 1188 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1189 if (r < 0)
1190 return r;
1191
de190aef
LP
1192 o->entry_array.next_entry_array_offset = htole64(q);
1193 }
cec736d2 1194
2dee23eb
LP
1195 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1196 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1197
de190aef
LP
1198 *idx = htole64(hidx + 1);
1199
1200 return 0;
1201}
cec736d2 1202
de190aef 1203static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1204 le64_t *extra,
1205 le64_t *first,
1206 le64_t *idx,
de190aef
LP
1207 uint64_t p) {
1208
1209 int r;
1210
1211 assert(f);
1212 assert(extra);
1213 assert(first);
1214 assert(idx);
1215 assert(p > 0);
1216
1217 if (*idx == 0)
1218 *extra = htole64(p);
1219 else {
4fd052ae 1220 le64_t i;
de190aef 1221
7be3aa17 1222 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1223 r = link_entry_into_array(f, first, &i, p);
1224 if (r < 0)
1225 return r;
cec736d2
LP
1226 }
1227
de190aef
LP
1228 *idx = htole64(le64toh(*idx) + 1);
1229 return 0;
1230}
1231
1232static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1233 uint64_t p;
1234 int r;
1235 assert(f);
1236 assert(o);
1237 assert(offset > 0);
1238
1239 p = le64toh(o->entry.items[i].object_offset);
1240 if (p == 0)
1241 return -EINVAL;
1242
1243 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1244 if (r < 0)
1245 return r;
1246
de190aef
LP
1247 return link_entry_into_array_plus_one(f,
1248 &o->data.entry_offset,
1249 &o->data.entry_array_offset,
1250 &o->data.n_entries,
1251 offset);
cec736d2
LP
1252}
1253
1254static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1255 uint64_t n, i;
cec736d2
LP
1256 int r;
1257
1258 assert(f);
1259 assert(o);
1260 assert(offset > 0);
b588975f
LP
1261
1262 if (o->object.type != OBJECT_ENTRY)
1263 return -EINVAL;
cec736d2 1264
b788cc23
LP
1265 __sync_synchronize();
1266
cec736d2 1267 /* Link up the entry itself */
de190aef
LP
1268 r = link_entry_into_array(f,
1269 &f->header->entry_array_offset,
1270 &f->header->n_entries,
1271 offset);
1272 if (r < 0)
1273 return r;
cec736d2 1274
507f22bd 1275 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1276
de190aef 1277 if (f->header->head_entry_realtime == 0)
0ac38b70 1278 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1279
0ac38b70 1280 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1281 f->header->tail_entry_monotonic = o->entry.monotonic;
1282
1283 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1284
1285 /* Link up the items */
1286 n = journal_file_entry_n_items(o);
1287 for (i = 0; i < n; i++) {
1288 r = journal_file_link_entry_item(f, o, offset, i);
1289 if (r < 0)
1290 return r;
1291 }
1292
cec736d2
LP
1293 return 0;
1294}
1295
1296static int journal_file_append_entry_internal(
1297 JournalFile *f,
1298 const dual_timestamp *ts,
1299 uint64_t xor_hash,
1300 const EntryItem items[], unsigned n_items,
de190aef 1301 uint64_t *seqnum,
cec736d2
LP
1302 Object **ret, uint64_t *offset) {
1303 uint64_t np;
1304 uint64_t osize;
1305 Object *o;
1306 int r;
1307
1308 assert(f);
1309 assert(items || n_items == 0);
de190aef 1310 assert(ts);
cec736d2
LP
1311
1312 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1313
de190aef 1314 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1315 if (r < 0)
1316 return r;
1317
d98cc1f2 1318 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1319 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1320 o->entry.realtime = htole64(ts->realtime);
1321 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1322 o->entry.xor_hash = htole64(xor_hash);
1323 o->entry.boot_id = f->header->boot_id;
1324
feb12d3e 1325#ifdef HAVE_GCRYPT
5996c7c2 1326 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1327 if (r < 0)
1328 return r;
feb12d3e 1329#endif
b0af6f41 1330
cec736d2
LP
1331 r = journal_file_link_entry(f, o, np);
1332 if (r < 0)
1333 return r;
1334
1335 if (ret)
1336 *ret = o;
1337
1338 if (offset)
1339 *offset = np;
1340
1341 return 0;
1342}
1343
cf244689 1344void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1345 assert(f);
1346
1347 /* inotify() does not receive IN_MODIFY events from file
1348 * accesses done via mmap(). After each access we hence
1349 * trigger IN_MODIFY by truncating the journal file to its
1350 * current size which triggers IN_MODIFY. */
1351
bc85bfee
LP
1352 __sync_synchronize();
1353
50f20cfd 1354 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
56f64d95 1355 log_error_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1356}
1357
1f2da9ec
LP
1358static int entry_item_cmp(const void *_a, const void *_b) {
1359 const EntryItem *a = _a, *b = _b;
1360
1361 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1362 return -1;
1363 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1364 return 1;
1365 return 0;
1366}
1367
de190aef 1368int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1369 unsigned i;
1370 EntryItem *items;
1371 int r;
1372 uint64_t xor_hash = 0;
de190aef 1373 struct dual_timestamp _ts;
cec736d2
LP
1374
1375 assert(f);
1376 assert(iovec || n_iovec == 0);
1377
de190aef
LP
1378 if (!ts) {
1379 dual_timestamp_get(&_ts);
1380 ts = &_ts;
1381 }
1382
1383 if (f->tail_entry_monotonic_valid &&
1384 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1385 return -EINVAL;
1386
feb12d3e 1387#ifdef HAVE_GCRYPT
7560fffc
LP
1388 r = journal_file_maybe_append_tag(f, ts->realtime);
1389 if (r < 0)
1390 return r;
feb12d3e 1391#endif
7560fffc 1392
64825d3c 1393 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1394 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1395
1396 for (i = 0; i < n_iovec; i++) {
1397 uint64_t p;
1398 Object *o;
1399
1400 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1401 if (r < 0)
cf244689 1402 return r;
cec736d2
LP
1403
1404 xor_hash ^= le64toh(o->data.hash);
1405 items[i].object_offset = htole64(p);
de7b95cd 1406 items[i].hash = o->data.hash;
cec736d2
LP
1407 }
1408
1f2da9ec
LP
1409 /* Order by the position on disk, in order to improve seek
1410 * times for rotating media. */
7ff7394d 1411 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1412
de190aef 1413 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1414
fa6ac760
LP
1415 /* If the memory mapping triggered a SIGBUS then we return an
1416 * IO error and ignore the error code passed down to us, since
1417 * it is very likely just an effect of a nullified replacement
1418 * mapping page */
1419
1420 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1421 r = -EIO;
1422
50f20cfd
LP
1423 journal_file_post_change(f);
1424
cec736d2
LP
1425 return r;
1426}
1427
/* One cached position within an entry array chain, keyed by the offset of
 * the chain's first array. */
typedef struct ChainCacheItem {
        /* the array at the beginning of the chain */
        uint64_t first;

        /* the cached array */
        uint64_t array;

        /* the first item in the cached array */
        uint64_t begin;

        /* the total number of items in all arrays before this one in the chain */
        uint64_t total;

        /* the last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1435
1436static void chain_cache_put(
4743015d 1437 OrderedHashmap *h,
a4bcff5b
LP
1438 ChainCacheItem *ci,
1439 uint64_t first,
1440 uint64_t array,
1441 uint64_t begin,
f268980d
LP
1442 uint64_t total,
1443 uint64_t last_index) {
a4bcff5b
LP
1444
1445 if (!ci) {
34741aa3
LP
1446 /* If the chain item to cache for this chain is the
1447 * first one it's not worth caching anything */
1448 if (array == first)
1449 return;
1450
29433089 1451 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 1452 ci = ordered_hashmap_steal_first(h);
29433089
LP
1453 assert(ci);
1454 } else {
a4bcff5b
LP
1455 ci = new(ChainCacheItem, 1);
1456 if (!ci)
1457 return;
1458 }
1459
1460 ci->first = first;
1461
4743015d 1462 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
1463 free(ci);
1464 return;
1465 }
1466 } else
1467 assert(ci->first == first);
1468
1469 ci->array = array;
1470 ci->begin = begin;
1471 ci->total = total;
f268980d 1472 ci->last_index = last_index;
a4bcff5b
LP
1473}
1474
f268980d
LP
1475static int generic_array_get(
1476 JournalFile *f,
1477 uint64_t first,
1478 uint64_t i,
1479 Object **ret, uint64_t *offset) {
de190aef 1480
cec736d2 1481 Object *o;
a4bcff5b 1482 uint64_t p = 0, a, t = 0;
cec736d2 1483 int r;
a4bcff5b 1484 ChainCacheItem *ci;
cec736d2
LP
1485
1486 assert(f);
1487
de190aef 1488 a = first;
a4bcff5b
LP
1489
1490 /* Try the chain cache first */
4743015d 1491 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1492 if (ci && i > ci->total) {
1493 a = ci->array;
1494 i -= ci->total;
1495 t = ci->total;
1496 }
1497
de190aef 1498 while (a > 0) {
a4bcff5b 1499 uint64_t k;
cec736d2 1500
de190aef
LP
1501 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1502 if (r < 0)
1503 return r;
cec736d2 1504
a4bcff5b
LP
1505 k = journal_file_entry_array_n_items(o);
1506 if (i < k) {
de190aef 1507 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1508 goto found;
cec736d2
LP
1509 }
1510
a4bcff5b
LP
1511 i -= k;
1512 t += k;
de190aef
LP
1513 a = le64toh(o->entry_array.next_entry_array_offset);
1514 }
1515
a4bcff5b
LP
1516 return 0;
1517
1518found:
1519 /* Let's cache this item for the next invocation */
af13a6b0 1520 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1521
1522 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1523 if (r < 0)
1524 return r;
1525
1526 if (ret)
1527 *ret = o;
1528
1529 if (offset)
1530 *offset = p;
1531
1532 return 1;
1533}
1534
f268980d
LP
1535static int generic_array_get_plus_one(
1536 JournalFile *f,
1537 uint64_t extra,
1538 uint64_t first,
1539 uint64_t i,
1540 Object **ret, uint64_t *offset) {
de190aef
LP
1541
1542 Object *o;
1543
1544 assert(f);
1545
1546 if (i == 0) {
1547 int r;
1548
1549 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1550 if (r < 0)
1551 return r;
1552
de190aef
LP
1553 if (ret)
1554 *ret = o;
cec736d2 1555
de190aef
LP
1556 if (offset)
1557 *offset = extra;
cec736d2 1558
de190aef 1559 return 1;
cec736d2
LP
1560 }
1561
de190aef
LP
1562 return generic_array_get(f, first, i-1, ret, offset);
1563}
cec736d2 1564
de190aef
LP
/* Results of the test_object callbacks used while bisecting:
 * the probed entry matches the needle, lies below it, or lies above it. */
enum {
        TEST_FOUND = 0,
        TEST_LEFT  = 1,
        TEST_RIGHT = 2
};
cec736d2 1570
f268980d
LP
1571static int generic_array_bisect(
1572 JournalFile *f,
1573 uint64_t first,
1574 uint64_t n,
1575 uint64_t needle,
1576 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1577 direction_t direction,
1578 Object **ret,
1579 uint64_t *offset,
1580 uint64_t *idx) {
1581
1582 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1583 bool subtract_one = false;
1584 Object *o, *array = NULL;
1585 int r;
a4bcff5b 1586 ChainCacheItem *ci;
cec736d2 1587
de190aef
LP
1588 assert(f);
1589 assert(test_object);
cec736d2 1590
a4bcff5b 1591 /* Start with the first array in the chain */
de190aef 1592 a = first;
a4bcff5b 1593
4743015d 1594 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1595 if (ci && n > ci->total) {
1596 /* Ah, we have iterated this bisection array chain
1597 * previously! Let's see if we can skip ahead in the
1598 * chain, as far as the last time. But we can't jump
1599 * backwards in the chain, so let's check that
1600 * first. */
1601
1602 r = test_object(f, ci->begin, needle);
1603 if (r < 0)
1604 return r;
1605
1606 if (r == TEST_LEFT) {
f268980d 1607 /* OK, what we are looking for is right of the
a4bcff5b
LP
1608 * begin of this EntryArray, so let's jump
1609 * straight to previously cached array in the
1610 * chain */
1611
1612 a = ci->array;
1613 n -= ci->total;
1614 t = ci->total;
f268980d 1615 last_index = ci->last_index;
a4bcff5b
LP
1616 }
1617 }
1618
de190aef
LP
1619 while (a > 0) {
1620 uint64_t left, right, k, lp;
1621
1622 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1623 if (r < 0)
1624 return r;
1625
de190aef
LP
1626 k = journal_file_entry_array_n_items(array);
1627 right = MIN(k, n);
1628 if (right <= 0)
1629 return 0;
cec736d2 1630
de190aef
LP
1631 i = right - 1;
1632 lp = p = le64toh(array->entry_array.items[i]);
1633 if (p <= 0)
1634 return -EBADMSG;
cec736d2 1635
de190aef
LP
1636 r = test_object(f, p, needle);
1637 if (r < 0)
1638 return r;
cec736d2 1639
de190aef
LP
1640 if (r == TEST_FOUND)
1641 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1642
1643 if (r == TEST_RIGHT) {
1644 left = 0;
1645 right -= 1;
f268980d
LP
1646
1647 if (last_index != (uint64_t) -1) {
1648 assert(last_index <= right);
1649
1650 /* If we cached the last index we
1651 * looked at, let's try to not to jump
1652 * too wildly around and see if we can
1653 * limit the range to look at early to
1654 * the immediate neighbors of the last
1655 * index we looked at. */
1656
1657 if (last_index > 0) {
1658 uint64_t x = last_index - 1;
1659
1660 p = le64toh(array->entry_array.items[x]);
1661 if (p <= 0)
1662 return -EBADMSG;
1663
1664 r = test_object(f, p, needle);
1665 if (r < 0)
1666 return r;
1667
1668 if (r == TEST_FOUND)
1669 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1670
1671 if (r == TEST_RIGHT)
1672 right = x;
1673 else
1674 left = x + 1;
1675 }
1676
1677 if (last_index < right) {
1678 uint64_t y = last_index + 1;
1679
1680 p = le64toh(array->entry_array.items[y]);
1681 if (p <= 0)
1682 return -EBADMSG;
1683
1684 r = test_object(f, p, needle);
1685 if (r < 0)
1686 return r;
1687
1688 if (r == TEST_FOUND)
1689 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1690
1691 if (r == TEST_RIGHT)
1692 right = y;
1693 else
1694 left = y + 1;
1695 }
f268980d
LP
1696 }
1697
de190aef
LP
1698 for (;;) {
1699 if (left == right) {
1700 if (direction == DIRECTION_UP)
1701 subtract_one = true;
1702
1703 i = left;
1704 goto found;
1705 }
1706
1707 assert(left < right);
de190aef 1708 i = (left + right) / 2;
f268980d 1709
de190aef
LP
1710 p = le64toh(array->entry_array.items[i]);
1711 if (p <= 0)
1712 return -EBADMSG;
1713
1714 r = test_object(f, p, needle);
1715 if (r < 0)
1716 return r;
cec736d2 1717
de190aef
LP
1718 if (r == TEST_FOUND)
1719 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1720
1721 if (r == TEST_RIGHT)
1722 right = i;
1723 else
1724 left = i + 1;
1725 }
1726 }
1727
2173cbf8 1728 if (k >= n) {
cbdca852
LP
1729 if (direction == DIRECTION_UP) {
1730 i = n;
1731 subtract_one = true;
1732 goto found;
1733 }
1734
cec736d2 1735 return 0;
cbdca852 1736 }
cec736d2 1737
de190aef
LP
1738 last_p = lp;
1739
1740 n -= k;
1741 t += k;
f268980d 1742 last_index = (uint64_t) -1;
de190aef 1743 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1744 }
1745
1746 return 0;
de190aef
LP
1747
1748found:
1749 if (subtract_one && t == 0 && i == 0)
1750 return 0;
1751
a4bcff5b 1752 /* Let's cache this item for the next invocation */
af13a6b0 1753 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 1754
de190aef
LP
1755 if (subtract_one && i == 0)
1756 p = last_p;
1757 else if (subtract_one)
1758 p = le64toh(array->entry_array.items[i-1]);
1759 else
1760 p = le64toh(array->entry_array.items[i]);
1761
1762 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1763 if (r < 0)
1764 return r;
1765
1766 if (ret)
1767 *ret = o;
1768
1769 if (offset)
1770 *offset = p;
1771
1772 if (idx)
cbdca852 1773 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1774
1775 return 1;
cec736d2
LP
1776}
1777
f268980d
LP
1778static int generic_array_bisect_plus_one(
1779 JournalFile *f,
1780 uint64_t extra,
1781 uint64_t first,
1782 uint64_t n,
1783 uint64_t needle,
1784 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1785 direction_t direction,
1786 Object **ret,
1787 uint64_t *offset,
1788 uint64_t *idx) {
de190aef 1789
cec736d2 1790 int r;
cbdca852
LP
1791 bool step_back = false;
1792 Object *o;
cec736d2
LP
1793
1794 assert(f);
de190aef 1795 assert(test_object);
cec736d2 1796
de190aef
LP
1797 if (n <= 0)
1798 return 0;
cec736d2 1799
de190aef
LP
1800 /* This bisects the array in object 'first', but first checks
1801 * an extra */
de190aef
LP
1802 r = test_object(f, extra, needle);
1803 if (r < 0)
1804 return r;
a536e261
LP
1805
1806 if (r == TEST_FOUND)
1807 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1808
cbdca852
LP
1809 /* if we are looking with DIRECTION_UP then we need to first
1810 see if in the actual array there is a matching entry, and
1811 return the last one of that. But if there isn't any we need
1812 to return this one. Hence remember this, and return it
1813 below. */
1814 if (r == TEST_LEFT)
1815 step_back = direction == DIRECTION_UP;
de190aef 1816
cbdca852
LP
1817 if (r == TEST_RIGHT) {
1818 if (direction == DIRECTION_DOWN)
1819 goto found;
1820 else
1821 return 0;
a536e261 1822 }
cec736d2 1823
de190aef
LP
1824 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1825
cbdca852
LP
1826 if (r == 0 && step_back)
1827 goto found;
1828
ecf68b1d 1829 if (r > 0 && idx)
de190aef
LP
1830 (*idx) ++;
1831
1832 return r;
cbdca852
LP
1833
1834found:
1835 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1836 if (r < 0)
1837 return r;
1838
1839 if (ret)
1840 *ret = o;
1841
1842 if (offset)
1843 *offset = extra;
1844
1845 if (idx)
1846 *idx = 0;
1847
1848 return 1;
1849}
1850
44a6b1b6 1851_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1852 assert(f);
1853 assert(p > 0);
1854
1855 if (p == needle)
1856 return TEST_FOUND;
1857 else if (p < needle)
1858 return TEST_LEFT;
1859 else
1860 return TEST_RIGHT;
1861}
1862
de190aef
LP
1863static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1864 Object *o;
1865 int r;
1866
1867 assert(f);
1868 assert(p > 0);
1869
1870 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1871 if (r < 0)
1872 return r;
1873
de190aef
LP
1874 if (le64toh(o->entry.seqnum) == needle)
1875 return TEST_FOUND;
1876 else if (le64toh(o->entry.seqnum) < needle)
1877 return TEST_LEFT;
1878 else
1879 return TEST_RIGHT;
1880}
cec736d2 1881
de190aef
LP
1882int journal_file_move_to_entry_by_seqnum(
1883 JournalFile *f,
1884 uint64_t seqnum,
1885 direction_t direction,
1886 Object **ret,
1887 uint64_t *offset) {
1888
1889 return generic_array_bisect(f,
1890 le64toh(f->header->entry_array_offset),
1891 le64toh(f->header->n_entries),
1892 seqnum,
1893 test_object_seqnum,
1894 direction,
1895 ret, offset, NULL);
1896}
cec736d2 1897
de190aef
LP
1898static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1899 Object *o;
1900 int r;
1901
1902 assert(f);
1903 assert(p > 0);
1904
1905 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1906 if (r < 0)
1907 return r;
1908
1909 if (le64toh(o->entry.realtime) == needle)
1910 return TEST_FOUND;
1911 else if (le64toh(o->entry.realtime) < needle)
1912 return TEST_LEFT;
1913 else
1914 return TEST_RIGHT;
cec736d2
LP
1915}
1916
de190aef
LP
1917int journal_file_move_to_entry_by_realtime(
1918 JournalFile *f,
1919 uint64_t realtime,
1920 direction_t direction,
1921 Object **ret,
1922 uint64_t *offset) {
1923
1924 return generic_array_bisect(f,
1925 le64toh(f->header->entry_array_offset),
1926 le64toh(f->header->n_entries),
1927 realtime,
1928 test_object_realtime,
1929 direction,
1930 ret, offset, NULL);
1931}
1932
1933static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1934 Object *o;
1935 int r;
1936
1937 assert(f);
1938 assert(p > 0);
1939
1940 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1941 if (r < 0)
1942 return r;
1943
1944 if (le64toh(o->entry.monotonic) == needle)
1945 return TEST_FOUND;
1946 else if (le64toh(o->entry.monotonic) < needle)
1947 return TEST_LEFT;
1948 else
1949 return TEST_RIGHT;
1950}
1951
47838ab3
ZJS
1952static inline int find_data_object_by_boot_id(
1953 JournalFile *f,
1954 sd_id128_t boot_id,
1955 Object **o,
1956 uint64_t *b) {
1957 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1958
1959 sd_id128_to_string(boot_id, t + 9);
1960 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1961}
1962
de190aef
LP
1963int journal_file_move_to_entry_by_monotonic(
1964 JournalFile *f,
1965 sd_id128_t boot_id,
1966 uint64_t monotonic,
1967 direction_t direction,
1968 Object **ret,
1969 uint64_t *offset) {
1970
de190aef
LP
1971 Object *o;
1972 int r;
1973
cbdca852 1974 assert(f);
de190aef 1975
47838ab3 1976 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
1977 if (r < 0)
1978 return r;
cbdca852 1979 if (r == 0)
de190aef
LP
1980 return -ENOENT;
1981
1982 return generic_array_bisect_plus_one(f,
1983 le64toh(o->data.entry_offset),
1984 le64toh(o->data.entry_array_offset),
1985 le64toh(o->data.n_entries),
1986 monotonic,
1987 test_object_monotonic,
1988 direction,
1989 ret, offset, NULL);
1990}
1991
1fc605b0 1992void journal_file_reset_location(JournalFile *f) {
6573ef05 1993 f->location_type = LOCATION_HEAD;
1fc605b0 1994 f->current_offset = 0;
6573ef05
MS
1995 f->current_seqnum = 0;
1996 f->current_realtime = 0;
1997 f->current_monotonic = 0;
1998 zero(f->current_boot_id);
1999 f->current_xor_hash = 0;
2000}
2001
2002void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
2003 f->last_direction = direction;
2004 f->location_type = LOCATION_SEEK;
2005 f->current_offset = offset;
2006 f->current_seqnum = le64toh(o->entry.seqnum);
2007 f->current_realtime = le64toh(o->entry.realtime);
2008 f->current_monotonic = le64toh(o->entry.monotonic);
2009 f->current_boot_id = o->entry.boot_id;
2010 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2011}
2012
d8ae66d7
MS
2013int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2014 assert(af);
2015 assert(bf);
2016 assert(af->location_type == LOCATION_SEEK);
2017 assert(bf->location_type == LOCATION_SEEK);
2018
2019 /* If contents and timestamps match, these entries are
2020 * identical, even if the seqnum does not match */
2021 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2022 af->current_monotonic == bf->current_monotonic &&
2023 af->current_realtime == bf->current_realtime &&
2024 af->current_xor_hash == bf->current_xor_hash)
2025 return 0;
2026
2027 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2028
2029 /* If this is from the same seqnum source, compare
2030 * seqnums */
2031 if (af->current_seqnum < bf->current_seqnum)
2032 return -1;
2033 if (af->current_seqnum > bf->current_seqnum)
2034 return 1;
2035
2036 /* Wow! This is weird, different data but the same
2037 * seqnums? Something is borked, but let's make the
2038 * best of it and compare by time. */
2039 }
2040
2041 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2042
2043 /* If the boot id matches, compare monotonic time */
2044 if (af->current_monotonic < bf->current_monotonic)
2045 return -1;
2046 if (af->current_monotonic > bf->current_monotonic)
2047 return 1;
2048 }
2049
2050 /* Otherwise, compare UTC time */
2051 if (af->current_realtime < bf->current_realtime)
2052 return -1;
2053 if (af->current_realtime > bf->current_realtime)
2054 return 1;
2055
2056 /* Finally, compare by contents */
2057 if (af->current_xor_hash < bf->current_xor_hash)
2058 return -1;
2059 if (af->current_xor_hash > bf->current_xor_hash)
2060 return 1;
2061
2062 return 0;
2063}
2064
de190aef
LP
2065int journal_file_next_entry(
2066 JournalFile *f,
f534928a 2067 uint64_t p,
de190aef
LP
2068 direction_t direction,
2069 Object **ret, uint64_t *offset) {
2070
fb099c8d 2071 uint64_t i, n, ofs;
cec736d2
LP
2072 int r;
2073
2074 assert(f);
de190aef
LP
2075
2076 n = le64toh(f->header->n_entries);
2077 if (n <= 0)
2078 return 0;
cec736d2 2079
f534928a 2080 if (p == 0)
de190aef 2081 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2082 else {
de190aef
LP
2083 r = generic_array_bisect(f,
2084 le64toh(f->header->entry_array_offset),
2085 le64toh(f->header->n_entries),
2086 p,
2087 test_object_offset,
2088 DIRECTION_DOWN,
2089 NULL, NULL,
2090 &i);
2091 if (r <= 0)
2092 return r;
2093
2094 if (direction == DIRECTION_DOWN) {
2095 if (i >= n - 1)
2096 return 0;
2097
2098 i++;
2099 } else {
2100 if (i <= 0)
2101 return 0;
2102
2103 i--;
2104 }
cec736d2
LP
2105 }
2106
de190aef 2107 /* And jump to it */
fb099c8d
ZJS
2108 r = generic_array_get(f,
2109 le64toh(f->header->entry_array_offset),
2110 i,
2111 ret, &ofs);
2112 if (r <= 0)
2113 return r;
2114
2115 if (p > 0 &&
2116 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2117 log_debug("%s: entry array corrupted at entry %"PRIu64,
2118 f->path, i);
2119 return -EBADMSG;
2120 }
2121
2122 if (offset)
2123 *offset = ofs;
2124
2125 return 1;
de190aef 2126}
cec736d2 2127
de190aef
LP
2128int journal_file_next_entry_for_data(
2129 JournalFile *f,
2130 Object *o, uint64_t p,
2131 uint64_t data_offset,
2132 direction_t direction,
2133 Object **ret, uint64_t *offset) {
2134
2135 uint64_t n, i;
cec736d2 2136 int r;
de190aef 2137 Object *d;
cec736d2
LP
2138
2139 assert(f);
de190aef 2140 assert(p > 0 || !o);
cec736d2 2141
de190aef 2142 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2143 if (r < 0)
de190aef 2144 return r;
cec736d2 2145
de190aef
LP
2146 n = le64toh(d->data.n_entries);
2147 if (n <= 0)
2148 return n;
cec736d2 2149
de190aef
LP
2150 if (!o)
2151 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2152 else {
2153 if (o->object.type != OBJECT_ENTRY)
2154 return -EINVAL;
cec736d2 2155
de190aef
LP
2156 r = generic_array_bisect_plus_one(f,
2157 le64toh(d->data.entry_offset),
2158 le64toh(d->data.entry_array_offset),
2159 le64toh(d->data.n_entries),
2160 p,
2161 test_object_offset,
2162 DIRECTION_DOWN,
2163 NULL, NULL,
2164 &i);
2165
2166 if (r <= 0)
cec736d2
LP
2167 return r;
2168
de190aef
LP
2169 if (direction == DIRECTION_DOWN) {
2170 if (i >= n - 1)
2171 return 0;
cec736d2 2172
de190aef
LP
2173 i++;
2174 } else {
2175 if (i <= 0)
2176 return 0;
cec736d2 2177
de190aef
LP
2178 i--;
2179 }
cec736d2 2180
de190aef 2181 }
cec736d2 2182
de190aef
LP
2183 return generic_array_get_plus_one(f,
2184 le64toh(d->data.entry_offset),
2185 le64toh(d->data.entry_array_offset),
2186 i,
2187 ret, offset);
2188}
cec736d2 2189
cbdca852
LP
2190int journal_file_move_to_entry_by_offset_for_data(
2191 JournalFile *f,
2192 uint64_t data_offset,
2193 uint64_t p,
2194 direction_t direction,
2195 Object **ret, uint64_t *offset) {
2196
2197 int r;
2198 Object *d;
2199
2200 assert(f);
2201
2202 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2203 if (r < 0)
2204 return r;
2205
2206 return generic_array_bisect_plus_one(f,
2207 le64toh(d->data.entry_offset),
2208 le64toh(d->data.entry_array_offset),
2209 le64toh(d->data.n_entries),
2210 p,
2211 test_object_offset,
2212 direction,
2213 ret, offset, NULL);
2214}
2215
2216int journal_file_move_to_entry_by_monotonic_for_data(
2217 JournalFile *f,
2218 uint64_t data_offset,
2219 sd_id128_t boot_id,
2220 uint64_t monotonic,
2221 direction_t direction,
2222 Object **ret, uint64_t *offset) {
2223
cbdca852
LP
2224 Object *o, *d;
2225 int r;
2226 uint64_t b, z;
2227
2228 assert(f);
2229
2230 /* First, seek by time */
47838ab3 2231 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2232 if (r < 0)
2233 return r;
2234 if (r == 0)
2235 return -ENOENT;
2236
2237 r = generic_array_bisect_plus_one(f,
2238 le64toh(o->data.entry_offset),
2239 le64toh(o->data.entry_array_offset),
2240 le64toh(o->data.n_entries),
2241 monotonic,
2242 test_object_monotonic,
2243 direction,
2244 NULL, &z, NULL);
2245 if (r <= 0)
2246 return r;
2247
2248 /* And now, continue seeking until we find an entry that
2249 * exists in both bisection arrays */
2250
2251 for (;;) {
2252 Object *qo;
2253 uint64_t p, q;
2254
2255 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2256 if (r < 0)
2257 return r;
2258
2259 r = generic_array_bisect_plus_one(f,
2260 le64toh(d->data.entry_offset),
2261 le64toh(d->data.entry_array_offset),
2262 le64toh(d->data.n_entries),
2263 z,
2264 test_object_offset,
2265 direction,
2266 NULL, &p, NULL);
2267 if (r <= 0)
2268 return r;
2269
2270 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2271 if (r < 0)
2272 return r;
2273
2274 r = generic_array_bisect_plus_one(f,
2275 le64toh(o->data.entry_offset),
2276 le64toh(o->data.entry_array_offset),
2277 le64toh(o->data.n_entries),
2278 p,
2279 test_object_offset,
2280 direction,
2281 &qo, &q, NULL);
2282
2283 if (r <= 0)
2284 return r;
2285
2286 if (p == q) {
2287 if (ret)
2288 *ret = qo;
2289 if (offset)
2290 *offset = q;
2291
2292 return 1;
2293 }
2294
2295 z = q;
2296 }
cbdca852
LP
2297}
2298
de190aef
LP
2299int journal_file_move_to_entry_by_seqnum_for_data(
2300 JournalFile *f,
2301 uint64_t data_offset,
2302 uint64_t seqnum,
2303 direction_t direction,
2304 Object **ret, uint64_t *offset) {
cec736d2 2305
de190aef
LP
2306 Object *d;
2307 int r;
cec736d2 2308
91a31dde
LP
2309 assert(f);
2310
de190aef 2311 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2312 if (r < 0)
de190aef 2313 return r;
cec736d2 2314
de190aef
LP
2315 return generic_array_bisect_plus_one(f,
2316 le64toh(d->data.entry_offset),
2317 le64toh(d->data.entry_array_offset),
2318 le64toh(d->data.n_entries),
2319 seqnum,
2320 test_object_seqnum,
2321 direction,
2322 ret, offset, NULL);
2323}
cec736d2 2324
de190aef
LP
2325int journal_file_move_to_entry_by_realtime_for_data(
2326 JournalFile *f,
2327 uint64_t data_offset,
2328 uint64_t realtime,
2329 direction_t direction,
2330 Object **ret, uint64_t *offset) {
2331
2332 Object *d;
2333 int r;
2334
91a31dde
LP
2335 assert(f);
2336
de190aef 2337 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2338 if (r < 0)
de190aef
LP
2339 return r;
2340
2341 return generic_array_bisect_plus_one(f,
2342 le64toh(d->data.entry_offset),
2343 le64toh(d->data.entry_array_offset),
2344 le64toh(d->data.n_entries),
2345 realtime,
2346 test_object_realtime,
2347 direction,
2348 ret, offset, NULL);
cec736d2
LP
2349}
2350
0284adc6 2351void journal_file_dump(JournalFile *f) {
7560fffc 2352 Object *o;
7560fffc 2353 int r;
0284adc6 2354 uint64_t p;
7560fffc
LP
2355
2356 assert(f);
2357
0284adc6 2358 journal_file_print_header(f);
7560fffc 2359
0284adc6
LP
2360 p = le64toh(f->header->header_size);
2361 while (p != 0) {
d05089d8 2362 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
2363 if (r < 0)
2364 goto fail;
7560fffc 2365
0284adc6 2366 switch (o->object.type) {
d98cc1f2 2367
0284adc6
LP
2368 case OBJECT_UNUSED:
2369 printf("Type: OBJECT_UNUSED\n");
2370 break;
d98cc1f2 2371
0284adc6
LP
2372 case OBJECT_DATA:
2373 printf("Type: OBJECT_DATA\n");
2374 break;
7560fffc 2375
3c1668da
LP
2376 case OBJECT_FIELD:
2377 printf("Type: OBJECT_FIELD\n");
2378 break;
2379
0284adc6 2380 case OBJECT_ENTRY:
507f22bd
ZJS
2381 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2382 le64toh(o->entry.seqnum),
2383 le64toh(o->entry.monotonic),
2384 le64toh(o->entry.realtime));
0284adc6 2385 break;
7560fffc 2386
0284adc6
LP
2387 case OBJECT_FIELD_HASH_TABLE:
2388 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2389 break;
7560fffc 2390
0284adc6
LP
2391 case OBJECT_DATA_HASH_TABLE:
2392 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2393 break;
7560fffc 2394
0284adc6
LP
2395 case OBJECT_ENTRY_ARRAY:
2396 printf("Type: OBJECT_ENTRY_ARRAY\n");
2397 break;
7560fffc 2398
0284adc6 2399 case OBJECT_TAG:
507f22bd
ZJS
2400 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2401 le64toh(o->tag.seqnum),
2402 le64toh(o->tag.epoch));
0284adc6 2403 break;
3c1668da
LP
2404
2405 default:
2406 printf("Type: unknown (%u)\n", o->object.type);
2407 break;
0284adc6 2408 }
7560fffc 2409
d89c8fdf
ZJS
2410 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2411 printf("Flags: %s\n",
2412 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 2413
0284adc6
LP
2414 if (p == le64toh(f->header->tail_object_offset))
2415 p = 0;
2416 else
2417 p = p + ALIGN64(le64toh(o->object.size));
2418 }
7560fffc 2419
0284adc6
LP
2420 return;
2421fail:
2422 log_error("File corrupt");
7560fffc
LP
2423}
2424
718fe4b1
ZJS
2425static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2426 const char *x;
2427
2428 x = format_timestamp(buf, l, t);
2429 if (x)
2430 return x;
2431 return " --- ";
2432}
2433
0284adc6 2434void journal_file_print_header(JournalFile *f) {
2765b7bb 2435 char a[33], b[33], c[33], d[33];
ed375beb 2436 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2437 struct stat st;
2438 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2439
2440 assert(f);
7560fffc 2441
0284adc6
LP
2442 printf("File Path: %s\n"
2443 "File ID: %s\n"
2444 "Machine ID: %s\n"
2445 "Boot ID: %s\n"
2446 "Sequential Number ID: %s\n"
2447 "State: %s\n"
2448 "Compatible Flags:%s%s\n"
d89c8fdf 2449 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
2450 "Header size: %"PRIu64"\n"
2451 "Arena size: %"PRIu64"\n"
2452 "Data Hash Table Size: %"PRIu64"\n"
2453 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2454 "Rotate Suggested: %s\n"
507f22bd
ZJS
2455 "Head Sequential Number: %"PRIu64"\n"
2456 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2457 "Head Realtime Timestamp: %s\n"
3223f44f 2458 "Tail Realtime Timestamp: %s\n"
ed375beb 2459 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2460 "Objects: %"PRIu64"\n"
2461 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2462 f->path,
2463 sd_id128_to_string(f->header->file_id, a),
2464 sd_id128_to_string(f->header->machine_id, b),
2465 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2466 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2467 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2468 f->header->state == STATE_ONLINE ? "ONLINE" :
2469 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 2470 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
2471 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2472 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2473 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2474 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
2475 le64toh(f->header->header_size),
2476 le64toh(f->header->arena_size),
2477 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2478 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2479 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2480 le64toh(f->header->head_entry_seqnum),
2481 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2482 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2483 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2484 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2485 le64toh(f->header->n_objects),
2486 le64toh(f->header->n_entries));
7560fffc 2487
0284adc6 2488 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2489 printf("Data Objects: %"PRIu64"\n"
0284adc6 2490 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2491 le64toh(f->header->n_data),
0284adc6 2492 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2493
0284adc6 2494 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2495 printf("Field Objects: %"PRIu64"\n"
0284adc6 2496 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2497 le64toh(f->header->n_fields),
0284adc6 2498 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2499
2500 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2501 printf("Tag Objects: %"PRIu64"\n",
2502 le64toh(f->header->n_tags));
3223f44f 2503 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2504 printf("Entry Array Objects: %"PRIu64"\n",
2505 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2506
2507 if (fstat(f->fd, &st) >= 0)
2508 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
7560fffc
LP
2509}
2510
0284adc6
LP
2511int journal_file_open(
2512 const char *fname,
2513 int flags,
2514 mode_t mode,
2515 bool compress,
baed47c3 2516 bool seal,
0284adc6
LP
2517 JournalMetrics *metrics,
2518 MMapCache *mmap_cache,
2519 JournalFile *template,
2520 JournalFile **ret) {
7560fffc 2521
fa6ac760 2522 bool newly_created = false;
0284adc6 2523 JournalFile *f;
fa6ac760 2524 void *h;
0284adc6 2525 int r;
7560fffc 2526
0284adc6 2527 assert(fname);
0559d3a5 2528 assert(ret);
7560fffc 2529
0284adc6
LP
2530 if ((flags & O_ACCMODE) != O_RDONLY &&
2531 (flags & O_ACCMODE) != O_RDWR)
2532 return -EINVAL;
7560fffc 2533
a0108012
LP
2534 if (!endswith(fname, ".journal") &&
2535 !endswith(fname, ".journal~"))
0284adc6 2536 return -EINVAL;
7560fffc 2537
0284adc6
LP
2538 f = new0(JournalFile, 1);
2539 if (!f)
2540 return -ENOMEM;
7560fffc 2541
0284adc6
LP
2542 f->fd = -1;
2543 f->mode = mode;
7560fffc 2544
0284adc6
LP
2545 f->flags = flags;
2546 f->prot = prot_from_flags(flags);
2547 f->writable = (flags & O_ACCMODE) != O_RDONLY;
92261977 2548#if defined(HAVE_LZ4)
d89c8fdf
ZJS
2549 f->compress_lz4 = compress;
2550#elif defined(HAVE_XZ)
2551 f->compress_xz = compress;
48b61739 2552#endif
49a32d43 2553#ifdef HAVE_GCRYPT
baed47c3 2554 f->seal = seal;
49a32d43 2555#endif
7560fffc 2556
0284adc6
LP
2557 if (mmap_cache)
2558 f->mmap = mmap_cache_ref(mmap_cache);
2559 else {
84168d80 2560 f->mmap = mmap_cache_new();
0284adc6
LP
2561 if (!f->mmap) {
2562 r = -ENOMEM;
2563 goto fail;
2564 }
2565 }
7560fffc 2566
0284adc6
LP
2567 f->path = strdup(fname);
2568 if (!f->path) {
2569 r = -ENOMEM;
2570 goto fail;
2571 }
7560fffc 2572
4743015d 2573 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
2574 if (!f->chain_cache) {
2575 r = -ENOMEM;
2576 goto fail;
2577 }
2578
0284adc6
LP
2579 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2580 if (f->fd < 0) {
2581 r = -errno;
2582 goto fail;
7560fffc 2583 }
7560fffc 2584
2678031a
LP
2585 r = journal_file_fstat(f);
2586 if (r < 0)
0284adc6 2587 goto fail;
7560fffc 2588
0284adc6 2589 if (f->last_stat.st_size == 0 && f->writable) {
fb0951b0
LP
2590 /* Let's attach the creation time to the journal file,
2591 * so that the vacuuming code knows the age of this
2592 * file even if the file might end up corrupted one
2593 * day... Ideally we'd just use the creation time many
2594 * file systems maintain for each file, but there is
2595 * currently no usable API to query this, hence let's
2596 * emulate this via extended attributes. If extended
2597 * attributes are not supported we'll just skip this,
7517e174 2598 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0 2599
4a4d89b6 2600 fd_setcrtime(f->fd, now(CLOCK_REALTIME));
7560fffc 2601
feb12d3e 2602#ifdef HAVE_GCRYPT
0284adc6 2603 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2604 * just don't do sealing */
49a32d43
LP
2605 if (f->seal) {
2606 r = journal_file_fss_load(f);
2607 if (r < 0)
2608 f->seal = false;
2609 }
feb12d3e 2610#endif
7560fffc 2611
0284adc6
LP
2612 r = journal_file_init_header(f, template);
2613 if (r < 0)
2614 goto fail;
7560fffc 2615
2678031a
LP
2616 r = journal_file_fstat(f);
2617 if (r < 0)
0284adc6 2618 goto fail;
fb0951b0
LP
2619
2620 newly_created = true;
0284adc6 2621 }
7560fffc 2622
0284adc6
LP
2623 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2624 r = -EIO;
2625 goto fail;
2626 }
7560fffc 2627
fa6ac760
LP
2628 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
2629 if (r < 0) {
0284adc6
LP
2630 r = -errno;
2631 goto fail;
2632 }
7560fffc 2633
fa6ac760
LP
2634 f->header = h;
2635
0284adc6
LP
2636 if (!newly_created) {
2637 r = journal_file_verify_header(f);
2638 if (r < 0)
2639 goto fail;
2640 }
7560fffc 2641
feb12d3e 2642#ifdef HAVE_GCRYPT
0284adc6 2643 if (!newly_created && f->writable) {
baed47c3 2644 r = journal_file_fss_load(f);
0284adc6
LP
2645 if (r < 0)
2646 goto fail;
2647 }
feb12d3e 2648#endif
cec736d2
LP
2649
2650 if (f->writable) {
4a92baf3
LP
2651 if (metrics) {
2652 journal_default_metrics(metrics, f->fd);
2653 f->metrics = *metrics;
2654 } else if (template)
2655 f->metrics = template->metrics;
2656
cec736d2
LP
2657 r = journal_file_refresh_header(f);
2658 if (r < 0)
2659 goto fail;
2660 }
2661
feb12d3e 2662#ifdef HAVE_GCRYPT
baed47c3 2663 r = journal_file_hmac_setup(f);
14d10188
LP
2664 if (r < 0)
2665 goto fail;
feb12d3e 2666#endif
14d10188 2667
cec736d2 2668 if (newly_created) {
de190aef 2669 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2670 if (r < 0)
2671 goto fail;
2672
de190aef 2673 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2674 if (r < 0)
2675 goto fail;
7560fffc 2676
feb12d3e 2677#ifdef HAVE_GCRYPT
7560fffc
LP
2678 r = journal_file_append_first_tag(f);
2679 if (r < 0)
2680 goto fail;
feb12d3e 2681#endif
cec736d2
LP
2682 }
2683
de190aef 2684 r = journal_file_map_field_hash_table(f);
cec736d2
LP
2685 if (r < 0)
2686 goto fail;
2687
de190aef 2688 r = journal_file_map_data_hash_table(f);
cec736d2
LP
2689 if (r < 0)
2690 goto fail;
2691
fa6ac760
LP
2692 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2693 r = -EIO;
2694 goto fail;
2695 }
2696
0559d3a5 2697 *ret = f;
cec736d2
LP
2698 return 0;
2699
2700fail:
fa6ac760
LP
2701 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2702 r = -EIO;
2703
cec736d2
LP
2704 journal_file_close(f);
2705
2706 return r;
2707}
0ac38b70 2708
baed47c3 2709int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2710 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2711 size_t l;
2712 JournalFile *old_file, *new_file = NULL;
2713 int r;
2714
2715 assert(f);
2716 assert(*f);
2717
2718 old_file = *f;
2719
2720 if (!old_file->writable)
2721 return -EINVAL;
2722
2723 if (!endswith(old_file->path, ".journal"))
2724 return -EINVAL;
2725
2726 l = strlen(old_file->path);
57535f47
ZJS
2727 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2728 (int) l - 8, old_file->path,
2729 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2730 le64toh((*f)->header->head_entry_seqnum),
2731 le64toh((*f)->header->head_entry_realtime));
2732 if (r < 0)
0ac38b70
LP
2733 return -ENOMEM;
2734
2678031a
LP
2735 /* Try to rename the file to the archived version. If the file
2736 * already was deleted, we'll get ENOENT, let's ignore that
2737 * case. */
0ac38b70 2738 r = rename(old_file->path, p);
2678031a 2739 if (r < 0 && errno != ENOENT)
0ac38b70
LP
2740 return -errno;
2741
ccdbaf91 2742 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2743
baed47c3 2744 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2745 journal_file_close(old_file);
2746
2747 *f = new_file;
2748 return r;
2749}
2750
9447a7f1
LP
2751int journal_file_open_reliably(
2752 const char *fname,
2753 int flags,
2754 mode_t mode,
7560fffc 2755 bool compress,
baed47c3 2756 bool seal,
4a92baf3 2757 JournalMetrics *metrics,
27370278 2758 MMapCache *mmap_cache,
9447a7f1
LP
2759 JournalFile *template,
2760 JournalFile **ret) {
2761
2762 int r;
2763 size_t l;
ed375beb 2764 _cleanup_free_ char *p = NULL;
9447a7f1 2765
baed47c3 2766 r = journal_file_open(fname, flags, mode, compress, seal,
27370278 2767 metrics, mmap_cache, template, ret);
0071d9f1
LP
2768 if (r != -EBADMSG && /* corrupted */
2769 r != -ENODATA && /* truncated */
2770 r != -EHOSTDOWN && /* other machine */
a1a1898f
LP
2771 r != -EPROTONOSUPPORT && /* incompatible feature */
2772 r != -EBUSY && /* unclean shutdown */
fa6ac760
LP
2773 r != -ESHUTDOWN && /* already archived */
2774 r != -EIO /* IO error, including SIGBUS on mmap */)
9447a7f1
LP
2775 return r;
2776
2777 if ((flags & O_ACCMODE) == O_RDONLY)
2778 return r;
2779
2780 if (!(flags & O_CREAT))
2781 return r;
2782
7560fffc
LP
2783 if (!endswith(fname, ".journal"))
2784 return r;
2785
5c70eab4
LP
2786 /* The file is corrupted. Rotate it away and try it again (but only once) */
2787
9447a7f1 2788 l = strlen(fname);
9bf3b535 2789 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
57535f47 2790 (int) l - 8, fname,
9447a7f1 2791 (unsigned long long) now(CLOCK_REALTIME),
9bf3b535 2792 random_u64()) < 0)
9447a7f1
LP
2793 return -ENOMEM;
2794
2795 r = rename(fname, p);
9447a7f1
LP
2796 if (r < 0)
2797 return -errno;
2798
a1a1898f 2799 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2800
baed47c3 2801 return journal_file_open(fname, flags, mode, compress, seal,
27370278 2802 metrics, mmap_cache, template, ret);
9447a7f1
LP
2803}
2804
cf244689
LP
2805int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2806 uint64_t i, n;
2807 uint64_t q, xor_hash = 0;
2808 int r;
2809 EntryItem *items;
2810 dual_timestamp ts;
2811
2812 assert(from);
2813 assert(to);
2814 assert(o);
2815 assert(p);
2816
2817 if (!to->writable)
2818 return -EPERM;
2819
2820 ts.monotonic = le64toh(o->entry.monotonic);
2821 ts.realtime = le64toh(o->entry.realtime);
2822
cf244689 2823 n = journal_file_entry_n_items(o);
4faa7004
TA
2824 /* alloca() can't take 0, hence let's allocate at least one */
2825 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
2826
2827 for (i = 0; i < n; i++) {
4fd052ae
FC
2828 uint64_t l, h;
2829 le64_t le_hash;
cf244689
LP
2830 size_t t;
2831 void *data;
2832 Object *u;
2833
2834 q = le64toh(o->entry.items[i].object_offset);
2835 le_hash = o->entry.items[i].hash;
2836
2837 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2838 if (r < 0)
2839 return r;
2840
2841 if (le_hash != o->data.hash)
2842 return -EBADMSG;
2843
2844 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2845 t = (size_t) l;
2846
2847 /* We hit the limit on 32bit machines */
2848 if ((uint64_t) t != l)
2849 return -E2BIG;
2850
d89c8fdf 2851 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 2852#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51 2853 size_t rsize;
cf244689 2854
d89c8fdf
ZJS
2855 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2856 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2857 if (r < 0)
2858 return r;
cf244689
LP
2859
2860 data = from->compress_buffer;
2861 l = rsize;
3b1a55e1
ZJS
2862#else
2863 return -EPROTONOSUPPORT;
2864#endif
cf244689
LP
2865 } else
2866 data = o->data.payload;
2867
2868 r = journal_file_append_data(to, data, l, &u, &h);
2869 if (r < 0)
2870 return r;
2871
2872 xor_hash ^= le64toh(u->data.hash);
2873 items[i].object_offset = htole64(h);
2874 items[i].hash = u->data.hash;
2875
2876 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2877 if (r < 0)
2878 return r;
2879 }
2880
fa6ac760
LP
2881 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2882
2883 if (mmap_cache_got_sigbus(to->mmap, to->fd))
2884 return -EIO;
2885
2886 return r;
cf244689 2887}
babfc091
LP
2888
2889void journal_default_metrics(JournalMetrics *m, int fd) {
2890 uint64_t fs_size = 0;
2891 struct statvfs ss;
a7bc2c2a 2892 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
babfc091
LP
2893
2894 assert(m);
2895 assert(fd >= 0);
2896
2897 if (fstatvfs(fd, &ss) >= 0)
2898 fs_size = ss.f_frsize * ss.f_blocks;
2899
2900 if (m->max_use == (uint64_t) -1) {
2901
2902 if (fs_size > 0) {
2903 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2904
2905 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2906 m->max_use = DEFAULT_MAX_USE_UPPER;
2907
2908 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2909 m->max_use = DEFAULT_MAX_USE_LOWER;
2910 } else
2911 m->max_use = DEFAULT_MAX_USE_LOWER;
2912 } else {
2913 m->max_use = PAGE_ALIGN(m->max_use);
2914
2915 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2916 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2917 }
2918
2919 if (m->max_size == (uint64_t) -1) {
2920 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2921
2922 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2923 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2924 } else
2925 m->max_size = PAGE_ALIGN(m->max_size);
2926
2927 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2928 m->max_size = JOURNAL_FILE_SIZE_MIN;
2929
2930 if (m->max_size*2 > m->max_use)
2931 m->max_use = m->max_size*2;
2932
2933 if (m->min_size == (uint64_t) -1)
2934 m->min_size = JOURNAL_FILE_SIZE_MIN;
2935 else {
2936 m->min_size = PAGE_ALIGN(m->min_size);
2937
2938 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2939 m->min_size = JOURNAL_FILE_SIZE_MIN;
2940
2941 if (m->min_size > m->max_size)
2942 m->max_size = m->min_size;
2943 }
2944
2945 if (m->keep_free == (uint64_t) -1) {
2946
2947 if (fs_size > 0) {
8621b110 2948 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
2949
2950 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2951 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2952
2953 } else
2954 m->keep_free = DEFAULT_KEEP_FREE;
2955 }
2956
2b43f939
LP
2957 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2958 format_bytes(a, sizeof(a), m->max_use),
2959 format_bytes(b, sizeof(b), m->max_size),
2960 format_bytes(c, sizeof(c), m->min_size),
2961 format_bytes(d, sizeof(d), m->keep_free));
babfc091 2962}
08984293
LP
2963
2964int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
2965 assert(f);
2966 assert(from || to);
2967
2968 if (from) {
162566a4
LP
2969 if (f->header->head_entry_realtime == 0)
2970 return -ENOENT;
08984293 2971
162566a4 2972 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
2973 }
2974
2975 if (to) {
162566a4
LP
2976 if (f->header->tail_entry_realtime == 0)
2977 return -ENOENT;
08984293 2978
162566a4 2979 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
2980 }
2981
2982 return 1;
2983}
2984
2985int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
2986 Object *o;
2987 uint64_t p;
2988 int r;
2989
2990 assert(f);
2991 assert(from || to);
2992
47838ab3 2993 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
2994 if (r <= 0)
2995 return r;
2996
2997 if (le64toh(o->data.n_entries) <= 0)
2998 return 0;
2999
3000 if (from) {
3001 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3002 if (r < 0)
3003 return r;
3004
3005 *from = le64toh(o->entry.monotonic);
3006 }
3007
3008 if (to) {
3009 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3010 if (r < 0)
3011 return r;
3012
3013 r = generic_array_get_plus_one(f,
3014 le64toh(o->data.entry_offset),
3015 le64toh(o->data.entry_array_offset),
3016 le64toh(o->data.n_entries)-1,
3017 &o, NULL);
3018 if (r <= 0)
3019 return r;
3020
3021 *to = le64toh(o->entry.monotonic);
3022 }
3023
3024 return 1;
3025}
dca6219e 3026
fb0951b0 3027bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
3028 assert(f);
3029
3030 /* If we gained new header fields we gained new features,
3031 * hence suggest a rotation */
361f9cbc
LP
3032 if (le64toh(f->header->header_size) < sizeof(Header)) {
3033 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3034 return true;
361f9cbc 3035 }
dca6219e
LP
3036
3037 /* Let's check if the hash tables grew over a certain fill
3038 * level (75%, borrowing this value from Java's hash table
3039 * implementation), and if so suggest a rotation. To calculate
3040 * the fill level we need the n_data field, which only exists
3041 * in newer versions. */
3042
3043 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3044 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3045 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3046 f->path,
3047 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3048 le64toh(f->header->n_data),
3049 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3050 (unsigned long long) f->last_stat.st_size,
3051 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3052 return true;
361f9cbc 3053 }
dca6219e
LP
3054
3055 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3056 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3057 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3058 f->path,
3059 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3060 le64toh(f->header->n_fields),
3061 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3062 return true;
361f9cbc 3063 }
dca6219e 3064
0598fd4a
LP
3065 /* Are the data objects properly indexed by field objects? */
3066 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3067 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3068 le64toh(f->header->n_data) > 0 &&
3069 le64toh(f->header->n_fields) == 0)
3070 return true;
3071
fb0951b0
LP
3072 if (max_file_usec > 0) {
3073 usec_t t, h;
3074
3075 h = le64toh(f->header->head_entry_realtime);
3076 t = now(CLOCK_REALTIME);
3077
3078 if (h > 0 && t > h + max_file_usec)
3079 return true;
3080 }
3081
dca6219e
LP
3082 return false;
3083}