]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
endian: explicitly include endian.h wherever we want to use __BYTE_ORDER
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <sys/mman.h>
23#include <errno.h>
24#include <sys/uio.h>
25#include <unistd.h>
26#include <sys/statvfs.h>
27#include <fcntl.h>
28#include <stddef.h>
d2edfae0 29#include <sys/xattr.h>
fb0951b0 30
cec736d2
LP
31#include "journal-def.h"
32#include "journal-file.h"
0284adc6 33#include "journal-authenticate.h"
cec736d2 34#include "lookup3.h"
807e17f0 35#include "compress.h"
7560fffc 36#include "fsprg.h"
cec736d2 37
4a92baf3
LP
/* Lower bound for the size (in bytes) of the data hash table; used
 * when the size derived from max_size would be smaller */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))

/* Fixed size (in bytes) of the field hash table */
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Data payloads of at least this many bytes are candidates for
 * compression when appending data objects */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)        /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* n_data was the first entry we added after the initial file format design */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)              /* 8 MiB */
70
9588bc32 71static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
72 assert(f);
73
74 if (!f->writable)
75 return -EPERM;
76
77 if (!(f->fd >= 0 && f->header))
78 return -EINVAL;
79
80 switch(f->header->state) {
81 case STATE_ONLINE:
82 return 0;
83
84 case STATE_OFFLINE:
85 f->header->state = STATE_ONLINE;
86 fsync(f->fd);
87 return 0;
88
89 default:
90 return -EINVAL;
91 }
92}
93
94int journal_file_set_offline(JournalFile *f) {
95 assert(f);
96
97 if (!f->writable)
98 return -EPERM;
99
100 if (!(f->fd >= 0 && f->header))
101 return -EINVAL;
102
103 if (f->header->state != STATE_ONLINE)
104 return 0;
105
106 fsync(f->fd);
107
108 f->header->state = STATE_OFFLINE;
109
110 fsync(f->fd);
111
112 return 0;
113}
114
cec736d2 115void journal_file_close(JournalFile *f) {
de190aef 116 assert(f);
cec736d2 117
feb12d3e 118#ifdef HAVE_GCRYPT
b0af6f41 119 /* Write the final tag */
c586dbf1 120 if (f->seal && f->writable)
b0af6f41 121 journal_file_append_tag(f);
feb12d3e 122#endif
b0af6f41 123
7560fffc 124 /* Sync everything to disk, before we mark the file offline */
16e9f408
LP
125 if (f->mmap && f->fd >= 0)
126 mmap_cache_close_fd(f->mmap, f->fd);
7560fffc 127
26687bf8 128 journal_file_set_offline(f);
cec736d2 129
26687bf8 130 if (f->header)
d384c7a8 131 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
cec736d2 132
03e334a1 133 safe_close(f->fd);
cec736d2 134 free(f->path);
807e17f0 135
16e9f408
LP
136 if (f->mmap)
137 mmap_cache_unref(f->mmap);
138
a4bcff5b
LP
139 hashmap_free_free(f->chain_cache);
140
d89c8fdf 141#if defined(HAVE_XZ) || defined(HAVE_LZ4)
807e17f0
LP
142 free(f->compress_buffer);
143#endif
144
7560fffc 145#ifdef HAVE_GCRYPT
baed47c3
LP
146 if (f->fss_file)
147 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
b7c9ae91
LP
148 else if (f->fsprg_state)
149 free(f->fsprg_state);
150
151 free(f->fsprg_seed);
7560fffc
LP
152
153 if (f->hmac)
154 gcry_md_close(f->hmac);
155#endif
156
cec736d2
LP
157 free(f);
158}
159
0ac38b70 160static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 161 Header h = {};
cec736d2
LP
162 ssize_t k;
163 int r;
164
165 assert(f);
166
7560fffc 167 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 168 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 169
d89c8fdf
ZJS
170 h.incompatible_flags |= htole32(
171 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
172 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 173
d89c8fdf
ZJS
174 h.compatible_flags = htole32(
175 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 176
cec736d2
LP
177 r = sd_id128_randomize(&h.file_id);
178 if (r < 0)
179 return r;
180
0ac38b70
LP
181 if (template) {
182 h.seqnum_id = template->header->seqnum_id;
beec0085 183 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
184 } else
185 h.seqnum_id = h.file_id;
cec736d2
LP
186
187 k = pwrite(f->fd, &h, sizeof(h), 0);
188 if (k < 0)
189 return -errno;
190
191 if (k != sizeof(h))
192 return -EIO;
193
194 return 0;
195}
196
197static int journal_file_refresh_header(JournalFile *f) {
198 int r;
de190aef 199 sd_id128_t boot_id;
cec736d2
LP
200
201 assert(f);
202
203 r = sd_id128_get_machine(&f->header->machine_id);
204 if (r < 0)
205 return r;
206
de190aef 207 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
208 if (r < 0)
209 return r;
210
de190aef
LP
211 if (sd_id128_equal(boot_id, f->header->boot_id))
212 f->tail_entry_monotonic_valid = true;
213
214 f->header->boot_id = boot_id;
215
26687bf8 216 journal_file_set_online(f);
b788cc23 217
7560fffc 218 /* Sync the online state to disk */
a676e665 219 fsync(f->fd);
b788cc23 220
cec736d2
LP
221 return 0;
222}
223
224static int journal_file_verify_header(JournalFile *f) {
d89c8fdf
ZJS
225 uint32_t flags;
226
cec736d2
LP
227 assert(f);
228
7560fffc 229 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
230 return -EBADMSG;
231
7560fffc
LP
232 /* In both read and write mode we refuse to open files with
233 * incompatible flags we don't know */
d89c8fdf
ZJS
234 flags = le32toh(f->header->incompatible_flags);
235 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
236 if (flags & ~HEADER_INCOMPATIBLE_ANY)
237 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
238 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
239 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
240 if (flags)
241 log_debug("Journal file %s uses incompatible flags %"PRIx32
242 " disabled at compilation time.", f->path, flags);
cec736d2 243 return -EPROTONOSUPPORT;
d89c8fdf 244 }
cec736d2 245
7560fffc
LP
246 /* When open for writing we refuse to open files with
247 * compatible flags, too */
d89c8fdf
ZJS
248 flags = le32toh(f->header->compatible_flags);
249 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
250 if (flags & ~HEADER_COMPATIBLE_ANY)
251 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
252 f->path, flags & ~HEADER_COMPATIBLE_ANY);
253 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
254 if (flags)
255 log_debug("Journal file %s uses compatible flags %"PRIx32
256 " disabled at compilation time.", f->path, flags);
257 return -EPROTONOSUPPORT;
7560fffc
LP
258 }
259
db11ac1a
LP
260 if (f->header->state >= _STATE_MAX)
261 return -EBADMSG;
262
dca6219e
LP
263 /* The first addition was n_data, so check that we are at least this large */
264 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
265 return -EBADMSG;
266
8088cbd3 267 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
268 return -EBADMSG;
269
db11ac1a
LP
270 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
271 return -ENODATA;
272
273 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
274 return -ENODATA;
275
7762e02b
LP
276 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
277 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
278 !VALID64(le64toh(f->header->tail_object_offset)) ||
279 !VALID64(le64toh(f->header->entry_array_offset)))
280 return -ENODATA;
281
282 if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
283 le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
284 le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
285 le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
cec736d2
LP
286 return -ENODATA;
287
288 if (f->writable) {
ccdbaf91 289 uint8_t state;
cec736d2
LP
290 sd_id128_t machine_id;
291 int r;
292
293 r = sd_id128_get_machine(&machine_id);
294 if (r < 0)
295 return r;
296
297 if (!sd_id128_equal(machine_id, f->header->machine_id))
298 return -EHOSTDOWN;
299
de190aef 300 state = f->header->state;
cec736d2 301
71fa6f00
LP
302 if (state == STATE_ONLINE) {
303 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
304 return -EBUSY;
305 } else if (state == STATE_ARCHIVED)
cec736d2 306 return -ESHUTDOWN;
71fa6f00
LP
307 else if (state != STATE_OFFLINE) {
308 log_debug("Journal file %s has unknown state %u.", f->path, state);
309 return -EBUSY;
310 }
cec736d2
LP
311 }
312
d89c8fdf
ZJS
313 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
314 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 315
f1889c91 316 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 317
cec736d2
LP
318 return 0;
319}
320
321static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 322 uint64_t old_size, new_size;
fec2aa2f 323 int r;
cec736d2
LP
324
325 assert(f);
326
cec736d2 327 /* We assume that this file is not sparse, and we know that
38ac38b2 328 * for sure, since we always call posix_fallocate()
cec736d2
LP
329 * ourselves */
330
331 old_size =
23b0b2b2 332 le64toh(f->header->header_size) +
cec736d2
LP
333 le64toh(f->header->arena_size);
334
bc85bfee 335 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
336 if (new_size < le64toh(f->header->header_size))
337 new_size = le64toh(f->header->header_size);
bc85bfee
LP
338
339 if (new_size <= old_size)
cec736d2
LP
340 return 0;
341
a676e665 342 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 343 return -E2BIG;
cec736d2 344
a676e665 345 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
346 struct statvfs svfs;
347
348 if (fstatvfs(f->fd, &svfs) >= 0) {
349 uint64_t available;
350
351 available = svfs.f_bfree * svfs.f_bsize;
352
bc85bfee
LP
353 if (available >= f->metrics.keep_free)
354 available -= f->metrics.keep_free;
cec736d2
LP
355 else
356 available = 0;
357
358 if (new_size - old_size > available)
359 return -E2BIG;
360 }
361 }
362
eda4b58b
LP
363 /* Increase by larger blocks at once */
364 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
365 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
366 new_size = f->metrics.max_size;
367
bc85bfee
LP
368 /* Note that the glibc fallocate() fallback is very
369 inefficient, hence we try to minimize the allocation area
370 as we can. */
fec2aa2f
GV
371 r = posix_fallocate(f->fd, old_size, new_size - old_size);
372 if (r != 0)
373 return -r;
cec736d2 374
eda4b58b
LP
375 if (fstat(f->fd, &f->last_stat) < 0)
376 return -errno;
cec736d2 377
23b0b2b2 378 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2
LP
379
380 return 0;
381}
382
fcde2389 383static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
cec736d2 384 assert(f);
cec736d2
LP
385 assert(ret);
386
7762e02b
LP
387 if (size <= 0)
388 return -EINVAL;
389
2a59ea54 390 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
391 if (offset + size > (uint64_t) f->last_stat.st_size) {
392 /* Hmm, out of range? Let's refresh the fstat() data
393 * first, before we trust that check. */
394
395 if (fstat(f->fd, &f->last_stat) < 0 ||
396 offset + size > (uint64_t) f->last_stat.st_size)
397 return -EADDRNOTAVAIL;
398 }
399
fcde2389 400 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
401}
402
16e9f408
LP
403static uint64_t minimum_header_size(Object *o) {
404
b8e891e6 405 static const uint64_t table[] = {
16e9f408
LP
406 [OBJECT_DATA] = sizeof(DataObject),
407 [OBJECT_FIELD] = sizeof(FieldObject),
408 [OBJECT_ENTRY] = sizeof(EntryObject),
409 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
410 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
411 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
412 [OBJECT_TAG] = sizeof(TagObject),
413 };
414
415 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
416 return sizeof(ObjectHeader);
417
418 return table[o->object.type];
419}
420
de190aef 421int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
cec736d2
LP
422 int r;
423 void *t;
424 Object *o;
425 uint64_t s;
426
427 assert(f);
428 assert(ret);
429
db11ac1a
LP
430 /* Objects may only be located at multiple of 64 bit */
431 if (!VALID64(offset))
432 return -EFAULT;
433
16e9f408 434
ae97089d 435 r = journal_file_move_to(f, type_to_context(type), false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
436 if (r < 0)
437 return r;
438
439 o = (Object*) t;
440 s = le64toh(o->object.size);
441
442 if (s < sizeof(ObjectHeader))
443 return -EBADMSG;
444
16e9f408
LP
445 if (o->object.type <= OBJECT_UNUSED)
446 return -EBADMSG;
447
448 if (s < minimum_header_size(o))
449 return -EBADMSG;
450
3c1668da 451 if (type > 0 && o->object.type != type)
cec736d2
LP
452 return -EBADMSG;
453
454 if (s > sizeof(ObjectHeader)) {
fcde2389 455 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
cec736d2
LP
456 if (r < 0)
457 return r;
458
459 o = (Object*) t;
460 }
461
cec736d2
LP
462 *ret = o;
463 return 0;
464}
465
d98cc1f2 466static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
467 uint64_t r;
468
469 assert(f);
470
beec0085 471 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
472
473 if (seqnum) {
de190aef 474 /* If an external seqnum counter was passed, we update
c2373f84
LP
475 * both the local and the external one, and set it to
476 * the maximum of both */
477
478 if (*seqnum + 1 > r)
479 r = *seqnum + 1;
480
481 *seqnum = r;
482 }
483
beec0085 484 f->header->tail_entry_seqnum = htole64(r);
cec736d2 485
beec0085
LP
486 if (f->header->head_entry_seqnum == 0)
487 f->header->head_entry_seqnum = htole64(r);
de190aef 488
cec736d2
LP
489 return r;
490}
491
0284adc6 492int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
493 int r;
494 uint64_t p;
495 Object *tail, *o;
496 void *t;
497
498 assert(f);
16e9f408 499 assert(type > 0 && type < _OBJECT_TYPE_MAX);
cec736d2
LP
500 assert(size >= sizeof(ObjectHeader));
501 assert(offset);
502 assert(ret);
503
26687bf8
OS
504 r = journal_file_set_online(f);
505 if (r < 0)
506 return r;
507
cec736d2 508 p = le64toh(f->header->tail_object_offset);
cec736d2 509 if (p == 0)
23b0b2b2 510 p = le64toh(f->header->header_size);
cec736d2 511 else {
de190aef 512 r = journal_file_move_to_object(f, -1, p, &tail);
cec736d2
LP
513 if (r < 0)
514 return r;
515
516 p += ALIGN64(le64toh(tail->object.size));
517 }
518
519 r = journal_file_allocate(f, p, size);
520 if (r < 0)
521 return r;
522
fcde2389 523 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
524 if (r < 0)
525 return r;
526
527 o = (Object*) t;
528
529 zero(o->object);
de190aef 530 o->object.type = type;
cec736d2
LP
531 o->object.size = htole64(size);
532
533 f->header->tail_object_offset = htole64(p);
cec736d2
LP
534 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
535
536 *ret = o;
537 *offset = p;
538
539 return 0;
540}
541
de190aef 542static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
543 uint64_t s, p;
544 Object *o;
545 int r;
546
547 assert(f);
548
dfabe643 549 /* We estimate that we need 1 hash table entry per 768 of
4a92baf3
LP
550 journal file and we want to make sure we never get beyond
551 75% fill level. Calculate the hash table size for the
552 maximum file size based on these metrics. */
553
dfabe643 554 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
555 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
556 s = DEFAULT_DATA_HASH_TABLE_SIZE;
557
507f22bd 558 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 559
de190aef
LP
560 r = journal_file_append_object(f,
561 OBJECT_DATA_HASH_TABLE,
562 offsetof(Object, hash_table.items) + s,
563 &o, &p);
cec736d2
LP
564 if (r < 0)
565 return r;
566
29804cc1 567 memzero(o->hash_table.items, s);
cec736d2 568
de190aef
LP
569 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
570 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
571
572 return 0;
573}
574
de190aef 575static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
576 uint64_t s, p;
577 Object *o;
578 int r;
579
580 assert(f);
581
3c1668da
LP
582 /* We use a fixed size hash table for the fields as this
583 * number should grow very slowly only */
584
de190aef
LP
585 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
586 r = journal_file_append_object(f,
587 OBJECT_FIELD_HASH_TABLE,
588 offsetof(Object, hash_table.items) + s,
589 &o, &p);
cec736d2
LP
590 if (r < 0)
591 return r;
592
29804cc1 593 memzero(o->hash_table.items, s);
cec736d2 594
de190aef
LP
595 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
596 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
597
598 return 0;
599}
600
de190aef 601static int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
602 uint64_t s, p;
603 void *t;
604 int r;
605
606 assert(f);
607
de190aef
LP
608 p = le64toh(f->header->data_hash_table_offset);
609 s = le64toh(f->header->data_hash_table_size);
cec736d2 610
de190aef 611 r = journal_file_move_to(f,
16e9f408 612 OBJECT_DATA_HASH_TABLE,
fcde2389 613 true,
de190aef
LP
614 p, s,
615 &t);
cec736d2
LP
616 if (r < 0)
617 return r;
618
de190aef 619 f->data_hash_table = t;
cec736d2
LP
620 return 0;
621}
622
de190aef 623static int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
624 uint64_t s, p;
625 void *t;
626 int r;
627
628 assert(f);
629
de190aef
LP
630 p = le64toh(f->header->field_hash_table_offset);
631 s = le64toh(f->header->field_hash_table_size);
cec736d2 632
de190aef 633 r = journal_file_move_to(f,
16e9f408 634 OBJECT_FIELD_HASH_TABLE,
fcde2389 635 true,
de190aef
LP
636 p, s,
637 &t);
cec736d2
LP
638 if (r < 0)
639 return r;
640
de190aef 641 f->field_hash_table = t;
cec736d2
LP
642 return 0;
643}
644
3c1668da
LP
645static int journal_file_link_field(
646 JournalFile *f,
647 Object *o,
648 uint64_t offset,
649 uint64_t hash) {
650
651 uint64_t p, h;
652 int r;
653
654 assert(f);
655 assert(o);
656 assert(offset > 0);
657
658 if (o->object.type != OBJECT_FIELD)
659 return -EINVAL;
660
661 /* This might alter the window we are looking at */
662
663 o->field.next_hash_offset = o->field.head_data_offset = 0;
664
665 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
666 p = le64toh(f->field_hash_table[h].tail_hash_offset);
667 if (p == 0)
668 f->field_hash_table[h].head_hash_offset = htole64(offset);
669 else {
670 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
671 if (r < 0)
672 return r;
673
674 o->field.next_hash_offset = htole64(offset);
675 }
676
677 f->field_hash_table[h].tail_hash_offset = htole64(offset);
678
679 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
680 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
681
682 return 0;
683}
684
685static int journal_file_link_data(
686 JournalFile *f,
687 Object *o,
688 uint64_t offset,
689 uint64_t hash) {
690
de190aef 691 uint64_t p, h;
cec736d2
LP
692 int r;
693
694 assert(f);
695 assert(o);
696 assert(offset > 0);
b588975f
LP
697
698 if (o->object.type != OBJECT_DATA)
699 return -EINVAL;
cec736d2 700
48496df6
LP
701 /* This might alter the window we are looking at */
702
de190aef
LP
703 o->data.next_hash_offset = o->data.next_field_offset = 0;
704 o->data.entry_offset = o->data.entry_array_offset = 0;
705 o->data.n_entries = 0;
cec736d2 706
de190aef 707 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
8db4213e 708 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 709 if (p == 0)
cec736d2 710 /* Only entry in the hash table is easy */
de190aef 711 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 712 else {
48496df6
LP
713 /* Move back to the previous data object, to patch in
714 * pointer */
cec736d2 715
de190aef 716 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
717 if (r < 0)
718 return r;
719
de190aef 720 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
721 }
722
de190aef 723 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 724
dca6219e
LP
725 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
726 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
727
cec736d2
LP
728 return 0;
729}
730
3c1668da
LP
731int journal_file_find_field_object_with_hash(
732 JournalFile *f,
733 const void *field, uint64_t size, uint64_t hash,
734 Object **ret, uint64_t *offset) {
735
736 uint64_t p, osize, h;
737 int r;
738
739 assert(f);
740 assert(field && size > 0);
741
742 osize = offsetof(Object, field.payload) + size;
743
744 if (f->header->field_hash_table_size == 0)
745 return -EBADMSG;
746
747 h = hash % (le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
748 p = le64toh(f->field_hash_table[h].head_hash_offset);
749
750 while (p > 0) {
751 Object *o;
752
753 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
754 if (r < 0)
755 return r;
756
757 if (le64toh(o->field.hash) == hash &&
758 le64toh(o->object.size) == osize &&
759 memcmp(o->field.payload, field, size) == 0) {
760
761 if (ret)
762 *ret = o;
763 if (offset)
764 *offset = p;
765
766 return 1;
767 }
768
769 p = le64toh(o->field.next_hash_offset);
770 }
771
772 return 0;
773}
774
775int journal_file_find_field_object(
776 JournalFile *f,
777 const void *field, uint64_t size,
778 Object **ret, uint64_t *offset) {
779
780 uint64_t hash;
781
782 assert(f);
783 assert(field && size > 0);
784
785 hash = hash64(field, size);
786
787 return journal_file_find_field_object_with_hash(f,
788 field, size, hash,
789 ret, offset);
790}
791
de190aef
LP
792int journal_file_find_data_object_with_hash(
793 JournalFile *f,
794 const void *data, uint64_t size, uint64_t hash,
795 Object **ret, uint64_t *offset) {
48496df6 796
de190aef 797 uint64_t p, osize, h;
cec736d2
LP
798 int r;
799
800 assert(f);
801 assert(data || size == 0);
802
803 osize = offsetof(Object, data.payload) + size;
804
bc85bfee
LP
805 if (f->header->data_hash_table_size == 0)
806 return -EBADMSG;
807
de190aef
LP
808 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
809 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 810
de190aef
LP
811 while (p > 0) {
812 Object *o;
cec736d2 813
de190aef 814 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
815 if (r < 0)
816 return r;
817
807e17f0 818 if (le64toh(o->data.hash) != hash)
85a131e8 819 goto next;
807e17f0 820
d89c8fdf 821 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
b785c858 822 uint64_t l, rsize;
cec736d2 823
807e17f0
LP
824 l = le64toh(o->object.size);
825 if (l <= offsetof(Object, data.payload))
cec736d2
LP
826 return -EBADMSG;
827
807e17f0
LP
828 l -= offsetof(Object, data.payload);
829
d89c8fdf
ZJS
830 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
831 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
832 if (r < 0)
833 return r;
807e17f0 834
b785c858 835 if (rsize == size &&
807e17f0
LP
836 memcmp(f->compress_buffer, data, size) == 0) {
837
838 if (ret)
839 *ret = o;
840
841 if (offset)
842 *offset = p;
843
844 return 1;
845 }
807e17f0
LP
846
847 } else if (le64toh(o->object.size) == osize &&
848 memcmp(o->data.payload, data, size) == 0) {
849
cec736d2
LP
850 if (ret)
851 *ret = o;
852
853 if (offset)
854 *offset = p;
855
de190aef 856 return 1;
cec736d2
LP
857 }
858
85a131e8 859 next:
cec736d2
LP
860 p = le64toh(o->data.next_hash_offset);
861 }
862
de190aef
LP
863 return 0;
864}
865
866int journal_file_find_data_object(
867 JournalFile *f,
868 const void *data, uint64_t size,
869 Object **ret, uint64_t *offset) {
870
871 uint64_t hash;
872
873 assert(f);
874 assert(data || size == 0);
875
876 hash = hash64(data, size);
877
878 return journal_file_find_data_object_with_hash(f,
879 data, size, hash,
880 ret, offset);
881}
882
3c1668da
LP
883static int journal_file_append_field(
884 JournalFile *f,
885 const void *field, uint64_t size,
886 Object **ret, uint64_t *offset) {
887
888 uint64_t hash, p;
889 uint64_t osize;
890 Object *o;
891 int r;
892
893 assert(f);
894 assert(field && size > 0);
895
896 hash = hash64(field, size);
897
898 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
899 if (r < 0)
900 return r;
901 else if (r > 0) {
902
903 if (ret)
904 *ret = o;
905
906 if (offset)
907 *offset = p;
908
909 return 0;
910 }
911
912 osize = offsetof(Object, field.payload) + size;
913 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
914 if (r < 0)
915 return r;
3c1668da
LP
916
917 o->field.hash = htole64(hash);
918 memcpy(o->field.payload, field, size);
919
920 r = journal_file_link_field(f, o, p, hash);
921 if (r < 0)
922 return r;
923
924 /* The linking might have altered the window, so let's
925 * refresh our pointer */
926 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
927 if (r < 0)
928 return r;
929
930#ifdef HAVE_GCRYPT
931 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
932 if (r < 0)
933 return r;
934#endif
935
936 if (ret)
937 *ret = o;
938
939 if (offset)
940 *offset = p;
941
942 return 0;
943}
944
48496df6
LP
945static int journal_file_append_data(
946 JournalFile *f,
947 const void *data, uint64_t size,
948 Object **ret, uint64_t *offset) {
949
de190aef
LP
950 uint64_t hash, p;
951 uint64_t osize;
952 Object *o;
d89c8fdf 953 int r, compression = 0;
3c1668da 954 const void *eq;
de190aef
LP
955
956 assert(f);
957 assert(data || size == 0);
958
959 hash = hash64(data, size);
960
961 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
962 if (r < 0)
963 return r;
964 else if (r > 0) {
965
966 if (ret)
967 *ret = o;
968
969 if (offset)
970 *offset = p;
971
972 return 0;
973 }
974
975 osize = offsetof(Object, data.payload) + size;
976 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
977 if (r < 0)
978 return r;
979
cec736d2 980 o->data.hash = htole64(hash);
807e17f0 981
d89c8fdf
ZJS
982#if defined(HAVE_XZ) || defined(HAVE_LZ4)
983 if (f->compress_xz &&
807e17f0
LP
984 size >= COMPRESSION_SIZE_THRESHOLD) {
985 uint64_t rsize;
986
d89c8fdf 987 compression = compress_blob(data, size, o->data.payload, &rsize);
807e17f0 988
d89c8fdf 989 if (compression) {
807e17f0 990 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 991 o->object.flags |= compression;
807e17f0 992
d89c8fdf
ZJS
993 log_debug("Compressed data object %"PRIu64" -> %"PRIu64" using %s",
994 size, rsize, object_compressed_to_string(compression));
807e17f0
LP
995 }
996 }
997#endif
998
d89c8fdf 999 if (!compression && size > 0)
807e17f0 1000 memcpy(o->data.payload, data, size);
cec736d2 1001
de190aef 1002 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1003 if (r < 0)
1004 return r;
1005
48496df6
LP
1006 /* The linking might have altered the window, so let's
1007 * refresh our pointer */
1008 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1009 if (r < 0)
1010 return r;
1011
08c6f819
SL
1012 if (!data)
1013 eq = NULL;
1014 else
1015 eq = memchr(data, '=', size);
3c1668da 1016 if (eq && eq > data) {
748db592 1017 Object *fo = NULL;
3c1668da 1018 uint64_t fp;
3c1668da
LP
1019
1020 /* Create field object ... */
1021 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1022 if (r < 0)
1023 return r;
1024
1025 /* ... and link it in. */
1026 o->data.next_field_offset = fo->field.head_data_offset;
1027 fo->field.head_data_offset = le64toh(p);
1028 }
1029
5996c7c2
LP
1030#ifdef HAVE_GCRYPT
1031 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1032 if (r < 0)
1033 return r;
1034#endif
1035
cec736d2
LP
1036 if (ret)
1037 *ret = o;
1038
1039 if (offset)
de190aef 1040 *offset = p;
cec736d2
LP
1041
1042 return 0;
1043}
1044
1045uint64_t journal_file_entry_n_items(Object *o) {
1046 assert(o);
b588975f
LP
1047
1048 if (o->object.type != OBJECT_ENTRY)
1049 return 0;
cec736d2
LP
1050
1051 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1052}
1053
0284adc6 1054uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1055 assert(o);
b588975f
LP
1056
1057 if (o->object.type != OBJECT_ENTRY_ARRAY)
1058 return 0;
de190aef
LP
1059
1060 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1061}
1062
fb9a24b6
LP
1063uint64_t journal_file_hash_table_n_items(Object *o) {
1064 assert(o);
b588975f
LP
1065
1066 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1067 o->object.type != OBJECT_FIELD_HASH_TABLE)
1068 return 0;
fb9a24b6
LP
1069
1070 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1071}
1072
de190aef 1073static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1074 le64_t *first,
1075 le64_t *idx,
de190aef 1076 uint64_t p) {
cec736d2 1077 int r;
de190aef
LP
1078 uint64_t n = 0, ap = 0, q, i, a, hidx;
1079 Object *o;
1080
cec736d2 1081 assert(f);
de190aef
LP
1082 assert(first);
1083 assert(idx);
1084 assert(p > 0);
cec736d2 1085
de190aef
LP
1086 a = le64toh(*first);
1087 i = hidx = le64toh(*idx);
1088 while (a > 0) {
1089
1090 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1091 if (r < 0)
1092 return r;
cec736d2 1093
de190aef
LP
1094 n = journal_file_entry_array_n_items(o);
1095 if (i < n) {
1096 o->entry_array.items[i] = htole64(p);
1097 *idx = htole64(hidx + 1);
1098 return 0;
1099 }
cec736d2 1100
de190aef
LP
1101 i -= n;
1102 ap = a;
1103 a = le64toh(o->entry_array.next_entry_array_offset);
1104 }
1105
1106 if (hidx > n)
1107 n = (hidx+1) * 2;
1108 else
1109 n = n * 2;
1110
1111 if (n < 4)
1112 n = 4;
1113
1114 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1115 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1116 &o, &q);
cec736d2
LP
1117 if (r < 0)
1118 return r;
1119
feb12d3e 1120#ifdef HAVE_GCRYPT
5996c7c2 1121 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1122 if (r < 0)
1123 return r;
feb12d3e 1124#endif
b0af6f41 1125
de190aef 1126 o->entry_array.items[i] = htole64(p);
cec736d2 1127
de190aef 1128 if (ap == 0)
7be3aa17 1129 *first = htole64(q);
cec736d2 1130 else {
de190aef 1131 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1132 if (r < 0)
1133 return r;
1134
de190aef
LP
1135 o->entry_array.next_entry_array_offset = htole64(q);
1136 }
cec736d2 1137
2dee23eb
LP
1138 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1139 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1140
de190aef
LP
1141 *idx = htole64(hidx + 1);
1142
1143 return 0;
1144}
cec736d2 1145
de190aef 1146static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1147 le64_t *extra,
1148 le64_t *first,
1149 le64_t *idx,
de190aef
LP
1150 uint64_t p) {
1151
1152 int r;
1153
1154 assert(f);
1155 assert(extra);
1156 assert(first);
1157 assert(idx);
1158 assert(p > 0);
1159
1160 if (*idx == 0)
1161 *extra = htole64(p);
1162 else {
4fd052ae 1163 le64_t i;
de190aef 1164
7be3aa17 1165 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1166 r = link_entry_into_array(f, first, &i, p);
1167 if (r < 0)
1168 return r;
cec736d2
LP
1169 }
1170
de190aef
LP
1171 *idx = htole64(le64toh(*idx) + 1);
1172 return 0;
1173}
1174
1175static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1176 uint64_t p;
1177 int r;
1178 assert(f);
1179 assert(o);
1180 assert(offset > 0);
1181
1182 p = le64toh(o->entry.items[i].object_offset);
1183 if (p == 0)
1184 return -EINVAL;
1185
1186 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1187 if (r < 0)
1188 return r;
1189
de190aef
LP
1190 return link_entry_into_array_plus_one(f,
1191 &o->data.entry_offset,
1192 &o->data.entry_array_offset,
1193 &o->data.n_entries,
1194 offset);
cec736d2
LP
1195}
1196
1197static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1198 uint64_t n, i;
cec736d2
LP
1199 int r;
1200
1201 assert(f);
1202 assert(o);
1203 assert(offset > 0);
b588975f
LP
1204
1205 if (o->object.type != OBJECT_ENTRY)
1206 return -EINVAL;
cec736d2 1207
b788cc23
LP
1208 __sync_synchronize();
1209
cec736d2 1210 /* Link up the entry itself */
de190aef
LP
1211 r = link_entry_into_array(f,
1212 &f->header->entry_array_offset,
1213 &f->header->n_entries,
1214 offset);
1215 if (r < 0)
1216 return r;
cec736d2 1217
507f22bd 1218 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1219
de190aef 1220 if (f->header->head_entry_realtime == 0)
0ac38b70 1221 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1222
0ac38b70 1223 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1224 f->header->tail_entry_monotonic = o->entry.monotonic;
1225
1226 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1227
1228 /* Link up the items */
1229 n = journal_file_entry_n_items(o);
1230 for (i = 0; i < n; i++) {
1231 r = journal_file_link_entry_item(f, o, offset, i);
1232 if (r < 0)
1233 return r;
1234 }
1235
cec736d2
LP
1236 return 0;
1237}
1238
1239static int journal_file_append_entry_internal(
1240 JournalFile *f,
1241 const dual_timestamp *ts,
1242 uint64_t xor_hash,
1243 const EntryItem items[], unsigned n_items,
de190aef 1244 uint64_t *seqnum,
cec736d2
LP
1245 Object **ret, uint64_t *offset) {
1246 uint64_t np;
1247 uint64_t osize;
1248 Object *o;
1249 int r;
1250
1251 assert(f);
1252 assert(items || n_items == 0);
de190aef 1253 assert(ts);
cec736d2
LP
1254
1255 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1256
de190aef 1257 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1258 if (r < 0)
1259 return r;
1260
d98cc1f2 1261 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1262 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1263 o->entry.realtime = htole64(ts->realtime);
1264 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1265 o->entry.xor_hash = htole64(xor_hash);
1266 o->entry.boot_id = f->header->boot_id;
1267
feb12d3e 1268#ifdef HAVE_GCRYPT
5996c7c2 1269 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1270 if (r < 0)
1271 return r;
feb12d3e 1272#endif
b0af6f41 1273
cec736d2
LP
1274 r = journal_file_link_entry(f, o, np);
1275 if (r < 0)
1276 return r;
1277
1278 if (ret)
1279 *ret = o;
1280
1281 if (offset)
1282 *offset = np;
1283
1284 return 0;
1285}
1286
cf244689 1287void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1288 assert(f);
1289
1290 /* inotify() does not receive IN_MODIFY events from file
1291 * accesses done via mmap(). After each access we hence
1292 * trigger IN_MODIFY by truncating the journal file to its
1293 * current size which triggers IN_MODIFY. */
1294
bc85bfee
LP
1295 __sync_synchronize();
1296
50f20cfd 1297 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
c5315881 1298 log_error("Failed to truncate file to its own size: %m");
50f20cfd
LP
1299}
1300
1f2da9ec
LP
1301static int entry_item_cmp(const void *_a, const void *_b) {
1302 const EntryItem *a = _a, *b = _b;
1303
1304 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1305 return -1;
1306 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1307 return 1;
1308 return 0;
1309}
1310
de190aef 1311int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1312 unsigned i;
1313 EntryItem *items;
1314 int r;
1315 uint64_t xor_hash = 0;
de190aef 1316 struct dual_timestamp _ts;
cec736d2
LP
1317
1318 assert(f);
1319 assert(iovec || n_iovec == 0);
1320
de190aef
LP
1321 if (!ts) {
1322 dual_timestamp_get(&_ts);
1323 ts = &_ts;
1324 }
1325
1326 if (f->tail_entry_monotonic_valid &&
1327 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1328 return -EINVAL;
1329
feb12d3e 1330#ifdef HAVE_GCRYPT
7560fffc
LP
1331 r = journal_file_maybe_append_tag(f, ts->realtime);
1332 if (r < 0)
1333 return r;
feb12d3e 1334#endif
7560fffc 1335
64825d3c 1336 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1337 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1338
1339 for (i = 0; i < n_iovec; i++) {
1340 uint64_t p;
1341 Object *o;
1342
1343 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1344 if (r < 0)
cf244689 1345 return r;
cec736d2
LP
1346
1347 xor_hash ^= le64toh(o->data.hash);
1348 items[i].object_offset = htole64(p);
de7b95cd 1349 items[i].hash = o->data.hash;
cec736d2
LP
1350 }
1351
1f2da9ec
LP
1352 /* Order by the position on disk, in order to improve seek
1353 * times for rotating media. */
7ff7394d 1354 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1355
de190aef 1356 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1357
50f20cfd
LP
1358 journal_file_post_change(f);
1359
cec736d2
LP
1360 return r;
1361}
1362
/* One cached position inside a chain of entry arrays. */
typedef struct ChainCacheItem {
        /* the array at the beginning of the chain */
        uint64_t first;

        /* the cached array */
        uint64_t array;

        /* the first item in the cached array */
        uint64_t begin;

        /* the total number of items in all arrays before this one in the chain */
        uint64_t total;

        /* the last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1370
1371static void chain_cache_put(
1372 Hashmap *h,
1373 ChainCacheItem *ci,
1374 uint64_t first,
1375 uint64_t array,
1376 uint64_t begin,
f268980d
LP
1377 uint64_t total,
1378 uint64_t last_index) {
a4bcff5b
LP
1379
1380 if (!ci) {
34741aa3
LP
1381 /* If the chain item to cache for this chain is the
1382 * first one it's not worth caching anything */
1383 if (array == first)
1384 return;
1385
a4bcff5b
LP
1386 if (hashmap_size(h) >= CHAIN_CACHE_MAX)
1387 ci = hashmap_steal_first(h);
1388 else {
1389 ci = new(ChainCacheItem, 1);
1390 if (!ci)
1391 return;
1392 }
1393
1394 ci->first = first;
1395
1396 if (hashmap_put(h, &ci->first, ci) < 0) {
1397 free(ci);
1398 return;
1399 }
1400 } else
1401 assert(ci->first == first);
1402
1403 ci->array = array;
1404 ci->begin = begin;
1405 ci->total = total;
f268980d 1406 ci->last_index = last_index;
a4bcff5b
LP
1407}
1408
f268980d
LP
1409static int generic_array_get(
1410 JournalFile *f,
1411 uint64_t first,
1412 uint64_t i,
1413 Object **ret, uint64_t *offset) {
de190aef 1414
cec736d2 1415 Object *o;
a4bcff5b 1416 uint64_t p = 0, a, t = 0;
cec736d2 1417 int r;
a4bcff5b 1418 ChainCacheItem *ci;
cec736d2
LP
1419
1420 assert(f);
1421
de190aef 1422 a = first;
a4bcff5b
LP
1423
1424 /* Try the chain cache first */
1425 ci = hashmap_get(f->chain_cache, &first);
1426 if (ci && i > ci->total) {
1427 a = ci->array;
1428 i -= ci->total;
1429 t = ci->total;
1430 }
1431
de190aef 1432 while (a > 0) {
a4bcff5b 1433 uint64_t k;
cec736d2 1434
de190aef
LP
1435 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1436 if (r < 0)
1437 return r;
cec736d2 1438
a4bcff5b
LP
1439 k = journal_file_entry_array_n_items(o);
1440 if (i < k) {
de190aef 1441 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1442 goto found;
cec736d2
LP
1443 }
1444
a4bcff5b
LP
1445 i -= k;
1446 t += k;
de190aef
LP
1447 a = le64toh(o->entry_array.next_entry_array_offset);
1448 }
1449
a4bcff5b
LP
1450 return 0;
1451
1452found:
1453 /* Let's cache this item for the next invocation */
af13a6b0 1454 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1455
1456 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1457 if (r < 0)
1458 return r;
1459
1460 if (ret)
1461 *ret = o;
1462
1463 if (offset)
1464 *offset = p;
1465
1466 return 1;
1467}
1468
f268980d
LP
1469static int generic_array_get_plus_one(
1470 JournalFile *f,
1471 uint64_t extra,
1472 uint64_t first,
1473 uint64_t i,
1474 Object **ret, uint64_t *offset) {
de190aef
LP
1475
1476 Object *o;
1477
1478 assert(f);
1479
1480 if (i == 0) {
1481 int r;
1482
1483 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1484 if (r < 0)
1485 return r;
1486
de190aef
LP
1487 if (ret)
1488 *ret = o;
cec736d2 1489
de190aef
LP
1490 if (offset)
1491 *offset = extra;
cec736d2 1492
de190aef 1493 return 1;
cec736d2
LP
1494 }
1495
de190aef
LP
1496 return generic_array_get(f, first, i-1, ret, offset);
1497}
cec736d2 1498
de190aef
LP
/* Results of the test_object_*() callbacks used for bisection */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object compares lower than the needle */
        TEST_RIGHT = 2  /* the tested object compares higher than the needle */
};
cec736d2 1504
f268980d
LP
1505static int generic_array_bisect(
1506 JournalFile *f,
1507 uint64_t first,
1508 uint64_t n,
1509 uint64_t needle,
1510 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1511 direction_t direction,
1512 Object **ret,
1513 uint64_t *offset,
1514 uint64_t *idx) {
1515
1516 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1517 bool subtract_one = false;
1518 Object *o, *array = NULL;
1519 int r;
a4bcff5b 1520 ChainCacheItem *ci;
cec736d2 1521
de190aef
LP
1522 assert(f);
1523 assert(test_object);
cec736d2 1524
a4bcff5b 1525 /* Start with the first array in the chain */
de190aef 1526 a = first;
a4bcff5b
LP
1527
1528 ci = hashmap_get(f->chain_cache, &first);
1529 if (ci && n > ci->total) {
1530 /* Ah, we have iterated this bisection array chain
1531 * previously! Let's see if we can skip ahead in the
1532 * chain, as far as the last time. But we can't jump
1533 * backwards in the chain, so let's check that
1534 * first. */
1535
1536 r = test_object(f, ci->begin, needle);
1537 if (r < 0)
1538 return r;
1539
1540 if (r == TEST_LEFT) {
f268980d 1541 /* OK, what we are looking for is right of the
a4bcff5b
LP
1542 * begin of this EntryArray, so let's jump
1543 * straight to previously cached array in the
1544 * chain */
1545
1546 a = ci->array;
1547 n -= ci->total;
1548 t = ci->total;
f268980d 1549 last_index = ci->last_index;
a4bcff5b
LP
1550 }
1551 }
1552
de190aef
LP
1553 while (a > 0) {
1554 uint64_t left, right, k, lp;
1555
1556 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1557 if (r < 0)
1558 return r;
1559
de190aef
LP
1560 k = journal_file_entry_array_n_items(array);
1561 right = MIN(k, n);
1562 if (right <= 0)
1563 return 0;
cec736d2 1564
de190aef
LP
1565 i = right - 1;
1566 lp = p = le64toh(array->entry_array.items[i]);
1567 if (p <= 0)
1568 return -EBADMSG;
cec736d2 1569
de190aef
LP
1570 r = test_object(f, p, needle);
1571 if (r < 0)
1572 return r;
cec736d2 1573
de190aef
LP
1574 if (r == TEST_FOUND)
1575 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1576
1577 if (r == TEST_RIGHT) {
1578 left = 0;
1579 right -= 1;
f268980d
LP
1580
1581 if (last_index != (uint64_t) -1) {
1582 assert(last_index <= right);
1583
1584 /* If we cached the last index we
1585 * looked at, let's try to not to jump
1586 * too wildly around and see if we can
1587 * limit the range to look at early to
1588 * the immediate neighbors of the last
1589 * index we looked at. */
1590
1591 if (last_index > 0) {
1592 uint64_t x = last_index - 1;
1593
1594 p = le64toh(array->entry_array.items[x]);
1595 if (p <= 0)
1596 return -EBADMSG;
1597
1598 r = test_object(f, p, needle);
1599 if (r < 0)
1600 return r;
1601
1602 if (r == TEST_FOUND)
1603 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1604
1605 if (r == TEST_RIGHT)
1606 right = x;
1607 else
1608 left = x + 1;
1609 }
1610
1611 if (last_index < right) {
1612 uint64_t y = last_index + 1;
1613
1614 p = le64toh(array->entry_array.items[y]);
1615 if (p <= 0)
1616 return -EBADMSG;
1617
1618 r = test_object(f, p, needle);
1619 if (r < 0)
1620 return r;
1621
1622 if (r == TEST_FOUND)
1623 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1624
1625 if (r == TEST_RIGHT)
1626 right = y;
1627 else
1628 left = y + 1;
1629 }
f268980d
LP
1630 }
1631
de190aef
LP
1632 for (;;) {
1633 if (left == right) {
1634 if (direction == DIRECTION_UP)
1635 subtract_one = true;
1636
1637 i = left;
1638 goto found;
1639 }
1640
1641 assert(left < right);
de190aef 1642 i = (left + right) / 2;
f268980d 1643
de190aef
LP
1644 p = le64toh(array->entry_array.items[i]);
1645 if (p <= 0)
1646 return -EBADMSG;
1647
1648 r = test_object(f, p, needle);
1649 if (r < 0)
1650 return r;
cec736d2 1651
de190aef
LP
1652 if (r == TEST_FOUND)
1653 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1654
1655 if (r == TEST_RIGHT)
1656 right = i;
1657 else
1658 left = i + 1;
1659 }
1660 }
1661
cbdca852
LP
1662 if (k > n) {
1663 if (direction == DIRECTION_UP) {
1664 i = n;
1665 subtract_one = true;
1666 goto found;
1667 }
1668
cec736d2 1669 return 0;
cbdca852 1670 }
cec736d2 1671
de190aef
LP
1672 last_p = lp;
1673
1674 n -= k;
1675 t += k;
f268980d 1676 last_index = (uint64_t) -1;
de190aef 1677 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1678 }
1679
1680 return 0;
de190aef
LP
1681
1682found:
1683 if (subtract_one && t == 0 && i == 0)
1684 return 0;
1685
a4bcff5b 1686 /* Let's cache this item for the next invocation */
af13a6b0 1687 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 1688
de190aef
LP
1689 if (subtract_one && i == 0)
1690 p = last_p;
1691 else if (subtract_one)
1692 p = le64toh(array->entry_array.items[i-1]);
1693 else
1694 p = le64toh(array->entry_array.items[i]);
1695
1696 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1697 if (r < 0)
1698 return r;
1699
1700 if (ret)
1701 *ret = o;
1702
1703 if (offset)
1704 *offset = p;
1705
1706 if (idx)
cbdca852 1707 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1708
1709 return 1;
cec736d2
LP
1710}
1711
f268980d
LP
1712
1713static int generic_array_bisect_plus_one(
1714 JournalFile *f,
1715 uint64_t extra,
1716 uint64_t first,
1717 uint64_t n,
1718 uint64_t needle,
1719 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1720 direction_t direction,
1721 Object **ret,
1722 uint64_t *offset,
1723 uint64_t *idx) {
de190aef 1724
cec736d2 1725 int r;
cbdca852
LP
1726 bool step_back = false;
1727 Object *o;
cec736d2
LP
1728
1729 assert(f);
de190aef 1730 assert(test_object);
cec736d2 1731
de190aef
LP
1732 if (n <= 0)
1733 return 0;
cec736d2 1734
de190aef
LP
1735 /* This bisects the array in object 'first', but first checks
1736 * an extra */
de190aef
LP
1737 r = test_object(f, extra, needle);
1738 if (r < 0)
1739 return r;
a536e261
LP
1740
1741 if (r == TEST_FOUND)
1742 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1743
cbdca852
LP
1744 /* if we are looking with DIRECTION_UP then we need to first
1745 see if in the actual array there is a matching entry, and
1746 return the last one of that. But if there isn't any we need
1747 to return this one. Hence remember this, and return it
1748 below. */
1749 if (r == TEST_LEFT)
1750 step_back = direction == DIRECTION_UP;
de190aef 1751
cbdca852
LP
1752 if (r == TEST_RIGHT) {
1753 if (direction == DIRECTION_DOWN)
1754 goto found;
1755 else
1756 return 0;
a536e261 1757 }
cec736d2 1758
de190aef
LP
1759 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1760
cbdca852
LP
1761 if (r == 0 && step_back)
1762 goto found;
1763
ecf68b1d 1764 if (r > 0 && idx)
de190aef
LP
1765 (*idx) ++;
1766
1767 return r;
cbdca852
LP
1768
1769found:
1770 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1771 if (r < 0)
1772 return r;
1773
1774 if (ret)
1775 *ret = o;
1776
1777 if (offset)
1778 *offset = extra;
1779
1780 if (idx)
1781 *idx = 0;
1782
1783 return 1;
1784}
1785
44a6b1b6 1786_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1787 assert(f);
1788 assert(p > 0);
1789
1790 if (p == needle)
1791 return TEST_FOUND;
1792 else if (p < needle)
1793 return TEST_LEFT;
1794 else
1795 return TEST_RIGHT;
1796}
1797
1798int journal_file_move_to_entry_by_offset(
1799 JournalFile *f,
1800 uint64_t p,
1801 direction_t direction,
1802 Object **ret,
1803 uint64_t *offset) {
1804
1805 return generic_array_bisect(f,
1806 le64toh(f->header->entry_array_offset),
1807 le64toh(f->header->n_entries),
1808 p,
1809 test_object_offset,
1810 direction,
1811 ret, offset, NULL);
de190aef
LP
1812}
1813
cbdca852 1814
de190aef
LP
1815static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1816 Object *o;
1817 int r;
1818
1819 assert(f);
1820 assert(p > 0);
1821
1822 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1823 if (r < 0)
1824 return r;
1825
de190aef
LP
1826 if (le64toh(o->entry.seqnum) == needle)
1827 return TEST_FOUND;
1828 else if (le64toh(o->entry.seqnum) < needle)
1829 return TEST_LEFT;
1830 else
1831 return TEST_RIGHT;
1832}
cec736d2 1833
de190aef
LP
1834int journal_file_move_to_entry_by_seqnum(
1835 JournalFile *f,
1836 uint64_t seqnum,
1837 direction_t direction,
1838 Object **ret,
1839 uint64_t *offset) {
1840
1841 return generic_array_bisect(f,
1842 le64toh(f->header->entry_array_offset),
1843 le64toh(f->header->n_entries),
1844 seqnum,
1845 test_object_seqnum,
1846 direction,
1847 ret, offset, NULL);
1848}
cec736d2 1849
de190aef
LP
1850static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1851 Object *o;
1852 int r;
1853
1854 assert(f);
1855 assert(p > 0);
1856
1857 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1858 if (r < 0)
1859 return r;
1860
1861 if (le64toh(o->entry.realtime) == needle)
1862 return TEST_FOUND;
1863 else if (le64toh(o->entry.realtime) < needle)
1864 return TEST_LEFT;
1865 else
1866 return TEST_RIGHT;
cec736d2
LP
1867}
1868
de190aef
LP
1869int journal_file_move_to_entry_by_realtime(
1870 JournalFile *f,
1871 uint64_t realtime,
1872 direction_t direction,
1873 Object **ret,
1874 uint64_t *offset) {
1875
1876 return generic_array_bisect(f,
1877 le64toh(f->header->entry_array_offset),
1878 le64toh(f->header->n_entries),
1879 realtime,
1880 test_object_realtime,
1881 direction,
1882 ret, offset, NULL);
1883}
1884
1885static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1886 Object *o;
1887 int r;
1888
1889 assert(f);
1890 assert(p > 0);
1891
1892 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1893 if (r < 0)
1894 return r;
1895
1896 if (le64toh(o->entry.monotonic) == needle)
1897 return TEST_FOUND;
1898 else if (le64toh(o->entry.monotonic) < needle)
1899 return TEST_LEFT;
1900 else
1901 return TEST_RIGHT;
1902}
1903
47838ab3
ZJS
1904static inline int find_data_object_by_boot_id(
1905 JournalFile *f,
1906 sd_id128_t boot_id,
1907 Object **o,
1908 uint64_t *b) {
1909 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1910
1911 sd_id128_to_string(boot_id, t + 9);
1912 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1913}
1914
de190aef
LP
1915int journal_file_move_to_entry_by_monotonic(
1916 JournalFile *f,
1917 sd_id128_t boot_id,
1918 uint64_t monotonic,
1919 direction_t direction,
1920 Object **ret,
1921 uint64_t *offset) {
1922
de190aef
LP
1923 Object *o;
1924 int r;
1925
cbdca852 1926 assert(f);
de190aef 1927
47838ab3 1928 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
1929 if (r < 0)
1930 return r;
cbdca852 1931 if (r == 0)
de190aef
LP
1932 return -ENOENT;
1933
1934 return generic_array_bisect_plus_one(f,
1935 le64toh(o->data.entry_offset),
1936 le64toh(o->data.entry_array_offset),
1937 le64toh(o->data.n_entries),
1938 monotonic,
1939 test_object_monotonic,
1940 direction,
1941 ret, offset, NULL);
1942}
1943
de190aef
LP
1944int journal_file_next_entry(
1945 JournalFile *f,
1946 Object *o, uint64_t p,
1947 direction_t direction,
1948 Object **ret, uint64_t *offset) {
1949
fb099c8d 1950 uint64_t i, n, ofs;
cec736d2
LP
1951 int r;
1952
1953 assert(f);
de190aef
LP
1954 assert(p > 0 || !o);
1955
1956 n = le64toh(f->header->n_entries);
1957 if (n <= 0)
1958 return 0;
cec736d2
LP
1959
1960 if (!o)
de190aef 1961 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 1962 else {
de190aef 1963 if (o->object.type != OBJECT_ENTRY)
cec736d2
LP
1964 return -EINVAL;
1965
de190aef
LP
1966 r = generic_array_bisect(f,
1967 le64toh(f->header->entry_array_offset),
1968 le64toh(f->header->n_entries),
1969 p,
1970 test_object_offset,
1971 DIRECTION_DOWN,
1972 NULL, NULL,
1973 &i);
1974 if (r <= 0)
1975 return r;
1976
1977 if (direction == DIRECTION_DOWN) {
1978 if (i >= n - 1)
1979 return 0;
1980
1981 i++;
1982 } else {
1983 if (i <= 0)
1984 return 0;
1985
1986 i--;
1987 }
cec736d2
LP
1988 }
1989
de190aef 1990 /* And jump to it */
fb099c8d
ZJS
1991 r = generic_array_get(f,
1992 le64toh(f->header->entry_array_offset),
1993 i,
1994 ret, &ofs);
1995 if (r <= 0)
1996 return r;
1997
1998 if (p > 0 &&
1999 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2000 log_debug("%s: entry array corrupted at entry %"PRIu64,
2001 f->path, i);
2002 return -EBADMSG;
2003 }
2004
2005 if (offset)
2006 *offset = ofs;
2007
2008 return 1;
de190aef 2009}
cec736d2 2010
de190aef
LP
2011int journal_file_skip_entry(
2012 JournalFile *f,
2013 Object *o, uint64_t p,
2014 int64_t skip,
2015 Object **ret, uint64_t *offset) {
2016
2017 uint64_t i, n;
2018 int r;
2019
2020 assert(f);
2021 assert(o);
2022 assert(p > 0);
2023
2024 if (o->object.type != OBJECT_ENTRY)
2025 return -EINVAL;
2026
2027 r = generic_array_bisect(f,
2028 le64toh(f->header->entry_array_offset),
2029 le64toh(f->header->n_entries),
2030 p,
2031 test_object_offset,
2032 DIRECTION_DOWN,
2033 NULL, NULL,
2034 &i);
2035 if (r <= 0)
cec736d2
LP
2036 return r;
2037
de190aef
LP
2038 /* Calculate new index */
2039 if (skip < 0) {
2040 if ((uint64_t) -skip >= i)
2041 i = 0;
2042 else
2043 i = i - (uint64_t) -skip;
2044 } else
2045 i += (uint64_t) skip;
cec736d2 2046
de190aef
LP
2047 n = le64toh(f->header->n_entries);
2048 if (n <= 0)
2049 return -EBADMSG;
cec736d2 2050
de190aef
LP
2051 if (i >= n)
2052 i = n-1;
2053
2054 return generic_array_get(f,
2055 le64toh(f->header->entry_array_offset),
2056 i,
2057 ret, offset);
cec736d2
LP
2058}
2059
de190aef
LP
2060int journal_file_next_entry_for_data(
2061 JournalFile *f,
2062 Object *o, uint64_t p,
2063 uint64_t data_offset,
2064 direction_t direction,
2065 Object **ret, uint64_t *offset) {
2066
2067 uint64_t n, i;
cec736d2 2068 int r;
de190aef 2069 Object *d;
cec736d2
LP
2070
2071 assert(f);
de190aef 2072 assert(p > 0 || !o);
cec736d2 2073
de190aef 2074 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2075 if (r < 0)
de190aef 2076 return r;
cec736d2 2077
de190aef
LP
2078 n = le64toh(d->data.n_entries);
2079 if (n <= 0)
2080 return n;
cec736d2 2081
de190aef
LP
2082 if (!o)
2083 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2084 else {
2085 if (o->object.type != OBJECT_ENTRY)
2086 return -EINVAL;
cec736d2 2087
de190aef
LP
2088 r = generic_array_bisect_plus_one(f,
2089 le64toh(d->data.entry_offset),
2090 le64toh(d->data.entry_array_offset),
2091 le64toh(d->data.n_entries),
2092 p,
2093 test_object_offset,
2094 DIRECTION_DOWN,
2095 NULL, NULL,
2096 &i);
2097
2098 if (r <= 0)
cec736d2
LP
2099 return r;
2100
de190aef
LP
2101 if (direction == DIRECTION_DOWN) {
2102 if (i >= n - 1)
2103 return 0;
cec736d2 2104
de190aef
LP
2105 i++;
2106 } else {
2107 if (i <= 0)
2108 return 0;
cec736d2 2109
de190aef
LP
2110 i--;
2111 }
cec736d2 2112
de190aef 2113 }
cec736d2 2114
de190aef
LP
2115 return generic_array_get_plus_one(f,
2116 le64toh(d->data.entry_offset),
2117 le64toh(d->data.entry_array_offset),
2118 i,
2119 ret, offset);
2120}
cec736d2 2121
cbdca852
LP
2122int journal_file_move_to_entry_by_offset_for_data(
2123 JournalFile *f,
2124 uint64_t data_offset,
2125 uint64_t p,
2126 direction_t direction,
2127 Object **ret, uint64_t *offset) {
2128
2129 int r;
2130 Object *d;
2131
2132 assert(f);
2133
2134 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2135 if (r < 0)
2136 return r;
2137
2138 return generic_array_bisect_plus_one(f,
2139 le64toh(d->data.entry_offset),
2140 le64toh(d->data.entry_array_offset),
2141 le64toh(d->data.n_entries),
2142 p,
2143 test_object_offset,
2144 direction,
2145 ret, offset, NULL);
2146}
2147
2148int journal_file_move_to_entry_by_monotonic_for_data(
2149 JournalFile *f,
2150 uint64_t data_offset,
2151 sd_id128_t boot_id,
2152 uint64_t monotonic,
2153 direction_t direction,
2154 Object **ret, uint64_t *offset) {
2155
cbdca852
LP
2156 Object *o, *d;
2157 int r;
2158 uint64_t b, z;
2159
2160 assert(f);
2161
2162 /* First, seek by time */
47838ab3 2163 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2164 if (r < 0)
2165 return r;
2166 if (r == 0)
2167 return -ENOENT;
2168
2169 r = generic_array_bisect_plus_one(f,
2170 le64toh(o->data.entry_offset),
2171 le64toh(o->data.entry_array_offset),
2172 le64toh(o->data.n_entries),
2173 monotonic,
2174 test_object_monotonic,
2175 direction,
2176 NULL, &z, NULL);
2177 if (r <= 0)
2178 return r;
2179
2180 /* And now, continue seeking until we find an entry that
2181 * exists in both bisection arrays */
2182
2183 for (;;) {
2184 Object *qo;
2185 uint64_t p, q;
2186
2187 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2188 if (r < 0)
2189 return r;
2190
2191 r = generic_array_bisect_plus_one(f,
2192 le64toh(d->data.entry_offset),
2193 le64toh(d->data.entry_array_offset),
2194 le64toh(d->data.n_entries),
2195 z,
2196 test_object_offset,
2197 direction,
2198 NULL, &p, NULL);
2199 if (r <= 0)
2200 return r;
2201
2202 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2203 if (r < 0)
2204 return r;
2205
2206 r = generic_array_bisect_plus_one(f,
2207 le64toh(o->data.entry_offset),
2208 le64toh(o->data.entry_array_offset),
2209 le64toh(o->data.n_entries),
2210 p,
2211 test_object_offset,
2212 direction,
2213 &qo, &q, NULL);
2214
2215 if (r <= 0)
2216 return r;
2217
2218 if (p == q) {
2219 if (ret)
2220 *ret = qo;
2221 if (offset)
2222 *offset = q;
2223
2224 return 1;
2225 }
2226
2227 z = q;
2228 }
cbdca852
LP
2229}
2230
de190aef
LP
2231int journal_file_move_to_entry_by_seqnum_for_data(
2232 JournalFile *f,
2233 uint64_t data_offset,
2234 uint64_t seqnum,
2235 direction_t direction,
2236 Object **ret, uint64_t *offset) {
cec736d2 2237
de190aef
LP
2238 Object *d;
2239 int r;
cec736d2 2240
91a31dde
LP
2241 assert(f);
2242
de190aef 2243 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2244 if (r < 0)
de190aef 2245 return r;
cec736d2 2246
de190aef
LP
2247 return generic_array_bisect_plus_one(f,
2248 le64toh(d->data.entry_offset),
2249 le64toh(d->data.entry_array_offset),
2250 le64toh(d->data.n_entries),
2251 seqnum,
2252 test_object_seqnum,
2253 direction,
2254 ret, offset, NULL);
2255}
cec736d2 2256
de190aef
LP
2257int journal_file_move_to_entry_by_realtime_for_data(
2258 JournalFile *f,
2259 uint64_t data_offset,
2260 uint64_t realtime,
2261 direction_t direction,
2262 Object **ret, uint64_t *offset) {
2263
2264 Object *d;
2265 int r;
2266
91a31dde
LP
2267 assert(f);
2268
de190aef 2269 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2270 if (r < 0)
de190aef
LP
2271 return r;
2272
2273 return generic_array_bisect_plus_one(f,
2274 le64toh(d->data.entry_offset),
2275 le64toh(d->data.entry_array_offset),
2276 le64toh(d->data.n_entries),
2277 realtime,
2278 test_object_realtime,
2279 direction,
2280 ret, offset, NULL);
cec736d2
LP
2281}
2282
0284adc6 2283void journal_file_dump(JournalFile *f) {
7560fffc 2284 Object *o;
7560fffc 2285 int r;
0284adc6 2286 uint64_t p;
7560fffc
LP
2287
2288 assert(f);
2289
0284adc6 2290 journal_file_print_header(f);
7560fffc 2291
0284adc6
LP
2292 p = le64toh(f->header->header_size);
2293 while (p != 0) {
2294 r = journal_file_move_to_object(f, -1, p, &o);
2295 if (r < 0)
2296 goto fail;
7560fffc 2297
0284adc6 2298 switch (o->object.type) {
d98cc1f2 2299
0284adc6
LP
2300 case OBJECT_UNUSED:
2301 printf("Type: OBJECT_UNUSED\n");
2302 break;
d98cc1f2 2303
0284adc6
LP
2304 case OBJECT_DATA:
2305 printf("Type: OBJECT_DATA\n");
2306 break;
7560fffc 2307
3c1668da
LP
2308 case OBJECT_FIELD:
2309 printf("Type: OBJECT_FIELD\n");
2310 break;
2311
0284adc6 2312 case OBJECT_ENTRY:
507f22bd
ZJS
2313 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2314 le64toh(o->entry.seqnum),
2315 le64toh(o->entry.monotonic),
2316 le64toh(o->entry.realtime));
0284adc6 2317 break;
7560fffc 2318
0284adc6
LP
2319 case OBJECT_FIELD_HASH_TABLE:
2320 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2321 break;
7560fffc 2322
0284adc6
LP
2323 case OBJECT_DATA_HASH_TABLE:
2324 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2325 break;
7560fffc 2326
0284adc6
LP
2327 case OBJECT_ENTRY_ARRAY:
2328 printf("Type: OBJECT_ENTRY_ARRAY\n");
2329 break;
7560fffc 2330
0284adc6 2331 case OBJECT_TAG:
507f22bd
ZJS
2332 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2333 le64toh(o->tag.seqnum),
2334 le64toh(o->tag.epoch));
0284adc6 2335 break;
3c1668da
LP
2336
2337 default:
2338 printf("Type: unknown (%u)\n", o->object.type);
2339 break;
0284adc6 2340 }
7560fffc 2341
d89c8fdf
ZJS
2342 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2343 printf("Flags: %s\n",
2344 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 2345
0284adc6
LP
2346 if (p == le64toh(f->header->tail_object_offset))
2347 p = 0;
2348 else
2349 p = p + ALIGN64(le64toh(o->object.size));
2350 }
7560fffc 2351
0284adc6
LP
2352 return;
2353fail:
2354 log_error("File corrupt");
7560fffc
LP
2355}
2356
718fe4b1
ZJS
2357static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2358 const char *x;
2359
2360 x = format_timestamp(buf, l, t);
2361 if (x)
2362 return x;
2363 return " --- ";
2364}
2365
0284adc6 2366void journal_file_print_header(JournalFile *f) {
2765b7bb 2367 char a[33], b[33], c[33], d[33];
ed375beb 2368 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2369 struct stat st;
2370 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2371
2372 assert(f);
7560fffc 2373
0284adc6
LP
2374 printf("File Path: %s\n"
2375 "File ID: %s\n"
2376 "Machine ID: %s\n"
2377 "Boot ID: %s\n"
2378 "Sequential Number ID: %s\n"
2379 "State: %s\n"
2380 "Compatible Flags:%s%s\n"
d89c8fdf 2381 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
2382 "Header size: %"PRIu64"\n"
2383 "Arena size: %"PRIu64"\n"
2384 "Data Hash Table Size: %"PRIu64"\n"
2385 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2386 "Rotate Suggested: %s\n"
507f22bd
ZJS
2387 "Head Sequential Number: %"PRIu64"\n"
2388 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2389 "Head Realtime Timestamp: %s\n"
3223f44f 2390 "Tail Realtime Timestamp: %s\n"
ed375beb 2391 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2392 "Objects: %"PRIu64"\n"
2393 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2394 f->path,
2395 sd_id128_to_string(f->header->file_id, a),
2396 sd_id128_to_string(f->header->machine_id, b),
2397 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2398 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2399 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2400 f->header->state == STATE_ONLINE ? "ONLINE" :
2401 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 2402 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
2403 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2404 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2405 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2406 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
2407 le64toh(f->header->header_size),
2408 le64toh(f->header->arena_size),
2409 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2410 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2411 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2412 le64toh(f->header->head_entry_seqnum),
2413 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2414 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2415 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2416 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2417 le64toh(f->header->n_objects),
2418 le64toh(f->header->n_entries));
7560fffc 2419
0284adc6 2420 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2421 printf("Data Objects: %"PRIu64"\n"
0284adc6 2422 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2423 le64toh(f->header->n_data),
0284adc6 2424 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2425
0284adc6 2426 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2427 printf("Field Objects: %"PRIu64"\n"
0284adc6 2428 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2429 le64toh(f->header->n_fields),
0284adc6 2430 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2431
2432 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2433 printf("Tag Objects: %"PRIu64"\n",
2434 le64toh(f->header->n_tags));
3223f44f 2435 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2436 printf("Entry Array Objects: %"PRIu64"\n",
2437 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2438
2439 if (fstat(f->fd, &st) >= 0)
2440 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
7560fffc
LP
2441}
2442
0284adc6
LP
2443int journal_file_open(
2444 const char *fname,
2445 int flags,
2446 mode_t mode,
2447 bool compress,
baed47c3 2448 bool seal,
0284adc6
LP
2449 JournalMetrics *metrics,
2450 MMapCache *mmap_cache,
2451 JournalFile *template,
2452 JournalFile **ret) {
7560fffc 2453
0284adc6
LP
2454 JournalFile *f;
2455 int r;
2456 bool newly_created = false;
7560fffc 2457
0284adc6 2458 assert(fname);
0559d3a5 2459 assert(ret);
7560fffc 2460
0284adc6
LP
2461 if ((flags & O_ACCMODE) != O_RDONLY &&
2462 (flags & O_ACCMODE) != O_RDWR)
2463 return -EINVAL;
7560fffc 2464
a0108012
LP
2465 if (!endswith(fname, ".journal") &&
2466 !endswith(fname, ".journal~"))
0284adc6 2467 return -EINVAL;
7560fffc 2468
0284adc6
LP
2469 f = new0(JournalFile, 1);
2470 if (!f)
2471 return -ENOMEM;
7560fffc 2472
0284adc6
LP
2473 f->fd = -1;
2474 f->mode = mode;
7560fffc 2475
0284adc6
LP
2476 f->flags = flags;
2477 f->prot = prot_from_flags(flags);
2478 f->writable = (flags & O_ACCMODE) != O_RDONLY;
92261977 2479#if defined(HAVE_LZ4)
d89c8fdf
ZJS
2480 f->compress_lz4 = compress;
2481#elif defined(HAVE_XZ)
2482 f->compress_xz = compress;
48b61739 2483#endif
49a32d43 2484#ifdef HAVE_GCRYPT
baed47c3 2485 f->seal = seal;
49a32d43 2486#endif
7560fffc 2487
0284adc6
LP
2488 if (mmap_cache)
2489 f->mmap = mmap_cache_ref(mmap_cache);
2490 else {
84168d80 2491 f->mmap = mmap_cache_new();
0284adc6
LP
2492 if (!f->mmap) {
2493 r = -ENOMEM;
2494 goto fail;
2495 }
2496 }
7560fffc 2497
0284adc6
LP
2498 f->path = strdup(fname);
2499 if (!f->path) {
2500 r = -ENOMEM;
2501 goto fail;
2502 }
7560fffc 2503
a4bcff5b
LP
2504 f->chain_cache = hashmap_new(uint64_hash_func, uint64_compare_func);
2505 if (!f->chain_cache) {
2506 r = -ENOMEM;
2507 goto fail;
2508 }
2509
0284adc6
LP
2510 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2511 if (f->fd < 0) {
2512 r = -errno;
2513 goto fail;
7560fffc 2514 }
7560fffc 2515
0284adc6
LP
2516 if (fstat(f->fd, &f->last_stat) < 0) {
2517 r = -errno;
2518 goto fail;
2519 }
7560fffc 2520
0284adc6 2521 if (f->last_stat.st_size == 0 && f->writable) {
fb0951b0
LP
2522 uint64_t crtime;
2523
2524 /* Let's attach the creation time to the journal file,
2525 * so that the vacuuming code knows the age of this
2526 * file even if the file might end up corrupted one
2527 * day... Ideally we'd just use the creation time many
2528 * file systems maintain for each file, but there is
2529 * currently no usable API to query this, hence let's
2530 * emulate this via extended attributes. If extended
2531 * attributes are not supported we'll just skip this,
2532 * and rely solely on mtime/atime/ctime of the file.*/
2533
2534 crtime = htole64((uint64_t) now(CLOCK_REALTIME));
2535 fsetxattr(f->fd, "user.crtime_usec", &crtime, sizeof(crtime), XATTR_CREATE);
7560fffc 2536
feb12d3e 2537#ifdef HAVE_GCRYPT
0284adc6 2538 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2539 * just don't do sealing */
49a32d43
LP
2540 if (f->seal) {
2541 r = journal_file_fss_load(f);
2542 if (r < 0)
2543 f->seal = false;
2544 }
feb12d3e 2545#endif
7560fffc 2546
0284adc6
LP
2547 r = journal_file_init_header(f, template);
2548 if (r < 0)
2549 goto fail;
7560fffc 2550
0284adc6
LP
2551 if (fstat(f->fd, &f->last_stat) < 0) {
2552 r = -errno;
2553 goto fail;
2554 }
fb0951b0
LP
2555
2556 newly_created = true;
0284adc6 2557 }
7560fffc 2558
0284adc6
LP
2559 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2560 r = -EIO;
2561 goto fail;
2562 }
7560fffc 2563
0284adc6
LP
2564 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2565 if (f->header == MAP_FAILED) {
2566 f->header = NULL;
2567 r = -errno;
2568 goto fail;
2569 }
7560fffc 2570
0284adc6
LP
2571 if (!newly_created) {
2572 r = journal_file_verify_header(f);
2573 if (r < 0)
2574 goto fail;
2575 }
7560fffc 2576
feb12d3e 2577#ifdef HAVE_GCRYPT
0284adc6 2578 if (!newly_created && f->writable) {
baed47c3 2579 r = journal_file_fss_load(f);
0284adc6
LP
2580 if (r < 0)
2581 goto fail;
2582 }
feb12d3e 2583#endif
cec736d2
LP
2584
2585 if (f->writable) {
4a92baf3
LP
2586 if (metrics) {
2587 journal_default_metrics(metrics, f->fd);
2588 f->metrics = *metrics;
2589 } else if (template)
2590 f->metrics = template->metrics;
2591
cec736d2
LP
2592 r = journal_file_refresh_header(f);
2593 if (r < 0)
2594 goto fail;
2595 }
2596
feb12d3e 2597#ifdef HAVE_GCRYPT
baed47c3 2598 r = journal_file_hmac_setup(f);
14d10188
LP
2599 if (r < 0)
2600 goto fail;
feb12d3e 2601#endif
14d10188 2602
cec736d2 2603 if (newly_created) {
de190aef 2604 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2605 if (r < 0)
2606 goto fail;
2607
de190aef 2608 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2609 if (r < 0)
2610 goto fail;
7560fffc 2611
feb12d3e 2612#ifdef HAVE_GCRYPT
7560fffc
LP
2613 r = journal_file_append_first_tag(f);
2614 if (r < 0)
2615 goto fail;
feb12d3e 2616#endif
cec736d2
LP
2617 }
2618
de190aef 2619 r = journal_file_map_field_hash_table(f);
cec736d2
LP
2620 if (r < 0)
2621 goto fail;
2622
de190aef 2623 r = journal_file_map_data_hash_table(f);
cec736d2
LP
2624 if (r < 0)
2625 goto fail;
2626
0559d3a5 2627 *ret = f;
cec736d2
LP
2628 return 0;
2629
2630fail:
2631 journal_file_close(f);
2632
2633 return r;
2634}
0ac38b70 2635
baed47c3 2636int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2637 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2638 size_t l;
2639 JournalFile *old_file, *new_file = NULL;
2640 int r;
2641
2642 assert(f);
2643 assert(*f);
2644
2645 old_file = *f;
2646
2647 if (!old_file->writable)
2648 return -EINVAL;
2649
2650 if (!endswith(old_file->path, ".journal"))
2651 return -EINVAL;
2652
2653 l = strlen(old_file->path);
57535f47
ZJS
2654 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2655 (int) l - 8, old_file->path,
2656 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2657 le64toh((*f)->header->head_entry_seqnum),
2658 le64toh((*f)->header->head_entry_realtime));
2659 if (r < 0)
0ac38b70
LP
2660 return -ENOMEM;
2661
0ac38b70 2662 r = rename(old_file->path, p);
0ac38b70
LP
2663 if (r < 0)
2664 return -errno;
2665
ccdbaf91 2666 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2667
baed47c3 2668 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2669 journal_file_close(old_file);
2670
2671 *f = new_file;
2672 return r;
2673}
2674
9447a7f1
LP
2675int journal_file_open_reliably(
2676 const char *fname,
2677 int flags,
2678 mode_t mode,
7560fffc 2679 bool compress,
baed47c3 2680 bool seal,
4a92baf3 2681 JournalMetrics *metrics,
27370278 2682 MMapCache *mmap_cache,
9447a7f1
LP
2683 JournalFile *template,
2684 JournalFile **ret) {
2685
2686 int r;
2687 size_t l;
ed375beb 2688 _cleanup_free_ char *p = NULL;
9447a7f1 2689
baed47c3 2690 r = journal_file_open(fname, flags, mode, compress, seal,
27370278 2691 metrics, mmap_cache, template, ret);
0071d9f1
LP
2692 if (r != -EBADMSG && /* corrupted */
2693 r != -ENODATA && /* truncated */
2694 r != -EHOSTDOWN && /* other machine */
a1a1898f
LP
2695 r != -EPROTONOSUPPORT && /* incompatible feature */
2696 r != -EBUSY && /* unclean shutdown */
2697 r != -ESHUTDOWN /* already archived */)
9447a7f1
LP
2698 return r;
2699
2700 if ((flags & O_ACCMODE) == O_RDONLY)
2701 return r;
2702
2703 if (!(flags & O_CREAT))
2704 return r;
2705
7560fffc
LP
2706 if (!endswith(fname, ".journal"))
2707 return r;
2708
5c70eab4
LP
2709 /* The file is corrupted. Rotate it away and try it again (but only once) */
2710
9447a7f1 2711 l = strlen(fname);
9bf3b535 2712 if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
57535f47 2713 (int) l - 8, fname,
9447a7f1 2714 (unsigned long long) now(CLOCK_REALTIME),
9bf3b535 2715 random_u64()) < 0)
9447a7f1
LP
2716 return -ENOMEM;
2717
2718 r = rename(fname, p);
9447a7f1
LP
2719 if (r < 0)
2720 return -errno;
2721
a1a1898f 2722 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2723
baed47c3 2724 return journal_file_open(fname, flags, mode, compress, seal,
27370278 2725 metrics, mmap_cache, template, ret);
9447a7f1
LP
2726}
2727
cf244689
LP
2728int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2729 uint64_t i, n;
2730 uint64_t q, xor_hash = 0;
2731 int r;
2732 EntryItem *items;
2733 dual_timestamp ts;
2734
2735 assert(from);
2736 assert(to);
2737 assert(o);
2738 assert(p);
2739
2740 if (!to->writable)
2741 return -EPERM;
2742
2743 ts.monotonic = le64toh(o->entry.monotonic);
2744 ts.realtime = le64toh(o->entry.realtime);
2745
cf244689 2746 n = journal_file_entry_n_items(o);
4faa7004
TA
2747 /* alloca() can't take 0, hence let's allocate at least one */
2748 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
2749
2750 for (i = 0; i < n; i++) {
4fd052ae
FC
2751 uint64_t l, h;
2752 le64_t le_hash;
cf244689
LP
2753 size_t t;
2754 void *data;
2755 Object *u;
2756
2757 q = le64toh(o->entry.items[i].object_offset);
2758 le_hash = o->entry.items[i].hash;
2759
2760 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2761 if (r < 0)
2762 return r;
2763
2764 if (le_hash != o->data.hash)
2765 return -EBADMSG;
2766
2767 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2768 t = (size_t) l;
2769
2770 /* We hit the limit on 32bit machines */
2771 if ((uint64_t) t != l)
2772 return -E2BIG;
2773
d89c8fdf 2774 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
cf244689
LP
2775 uint64_t rsize;
2776
d89c8fdf
ZJS
2777 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2778 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2779 if (r < 0)
2780 return r;
cf244689
LP
2781
2782 data = from->compress_buffer;
2783 l = rsize;
cf244689
LP
2784 } else
2785 data = o->data.payload;
2786
2787 r = journal_file_append_data(to, data, l, &u, &h);
2788 if (r < 0)
2789 return r;
2790
2791 xor_hash ^= le64toh(u->data.hash);
2792 items[i].object_offset = htole64(h);
2793 items[i].hash = u->data.hash;
2794
2795 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2796 if (r < 0)
2797 return r;
2798 }
2799
2800 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2801}
babfc091
LP
2802
2803void journal_default_metrics(JournalMetrics *m, int fd) {
2804 uint64_t fs_size = 0;
2805 struct statvfs ss;
a7bc2c2a 2806 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
babfc091
LP
2807
2808 assert(m);
2809 assert(fd >= 0);
2810
2811 if (fstatvfs(fd, &ss) >= 0)
2812 fs_size = ss.f_frsize * ss.f_blocks;
2813
2814 if (m->max_use == (uint64_t) -1) {
2815
2816 if (fs_size > 0) {
2817 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2818
2819 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2820 m->max_use = DEFAULT_MAX_USE_UPPER;
2821
2822 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2823 m->max_use = DEFAULT_MAX_USE_LOWER;
2824 } else
2825 m->max_use = DEFAULT_MAX_USE_LOWER;
2826 } else {
2827 m->max_use = PAGE_ALIGN(m->max_use);
2828
2829 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2830 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2831 }
2832
2833 if (m->max_size == (uint64_t) -1) {
2834 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2835
2836 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2837 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2838 } else
2839 m->max_size = PAGE_ALIGN(m->max_size);
2840
2841 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2842 m->max_size = JOURNAL_FILE_SIZE_MIN;
2843
2844 if (m->max_size*2 > m->max_use)
2845 m->max_use = m->max_size*2;
2846
2847 if (m->min_size == (uint64_t) -1)
2848 m->min_size = JOURNAL_FILE_SIZE_MIN;
2849 else {
2850 m->min_size = PAGE_ALIGN(m->min_size);
2851
2852 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2853 m->min_size = JOURNAL_FILE_SIZE_MIN;
2854
2855 if (m->min_size > m->max_size)
2856 m->max_size = m->min_size;
2857 }
2858
2859 if (m->keep_free == (uint64_t) -1) {
2860
2861 if (fs_size > 0) {
8621b110 2862 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
2863
2864 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2865 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2866
2867 } else
2868 m->keep_free = DEFAULT_KEEP_FREE;
2869 }
2870
2b43f939
LP
2871 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2872 format_bytes(a, sizeof(a), m->max_use),
2873 format_bytes(b, sizeof(b), m->max_size),
2874 format_bytes(c, sizeof(c), m->min_size),
2875 format_bytes(d, sizeof(d), m->keep_free));
babfc091 2876}
08984293
LP
2877
2878int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
2879 assert(f);
2880 assert(from || to);
2881
2882 if (from) {
162566a4
LP
2883 if (f->header->head_entry_realtime == 0)
2884 return -ENOENT;
08984293 2885
162566a4 2886 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
2887 }
2888
2889 if (to) {
162566a4
LP
2890 if (f->header->tail_entry_realtime == 0)
2891 return -ENOENT;
08984293 2892
162566a4 2893 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
2894 }
2895
2896 return 1;
2897}
2898
2899int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
2900 Object *o;
2901 uint64_t p;
2902 int r;
2903
2904 assert(f);
2905 assert(from || to);
2906
47838ab3 2907 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
2908 if (r <= 0)
2909 return r;
2910
2911 if (le64toh(o->data.n_entries) <= 0)
2912 return 0;
2913
2914 if (from) {
2915 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2916 if (r < 0)
2917 return r;
2918
2919 *from = le64toh(o->entry.monotonic);
2920 }
2921
2922 if (to) {
2923 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2924 if (r < 0)
2925 return r;
2926
2927 r = generic_array_get_plus_one(f,
2928 le64toh(o->data.entry_offset),
2929 le64toh(o->data.entry_array_offset),
2930 le64toh(o->data.n_entries)-1,
2931 &o, NULL);
2932 if (r <= 0)
2933 return r;
2934
2935 *to = le64toh(o->entry.monotonic);
2936 }
2937
2938 return 1;
2939}
dca6219e 2940
fb0951b0 2941bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
2942 assert(f);
2943
2944 /* If we gained new header fields we gained new features,
2945 * hence suggest a rotation */
361f9cbc
LP
2946 if (le64toh(f->header->header_size) < sizeof(Header)) {
2947 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 2948 return true;
361f9cbc 2949 }
dca6219e
LP
2950
2951 /* Let's check if the hash tables grew over a certain fill
2952 * level (75%, borrowing this value from Java's hash table
2953 * implementation), and if so suggest a rotation. To calculate
2954 * the fill level we need the n_data field, which only exists
2955 * in newer versions. */
2956
2957 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 2958 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 2959 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
2960 f->path,
2961 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
2962 le64toh(f->header->n_data),
2963 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2964 (unsigned long long) f->last_stat.st_size,
2965 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 2966 return true;
361f9cbc 2967 }
dca6219e
LP
2968
2969 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 2970 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 2971 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
2972 f->path,
2973 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
2974 le64toh(f->header->n_fields),
2975 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 2976 return true;
361f9cbc 2977 }
dca6219e 2978
0598fd4a
LP
2979 /* Are the data objects properly indexed by field objects? */
2980 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
2981 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
2982 le64toh(f->header->n_data) > 0 &&
2983 le64toh(f->header->n_fields) == 0)
2984 return true;
2985
fb0951b0
LP
2986 if (max_file_usec > 0) {
2987 usec_t t, h;
2988
2989 h = le64toh(f->header->head_entry_realtime);
2990 t = now(CLOCK_REALTIME);
2991
2992 if (h > 0 && t > h + max_file_usec)
2993 return true;
2994 }
2995
dca6219e
LP
2996 return false;
2997}