]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
Merge pull request #1653 from keszybz/lz4-compress-time
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <sys/mman.h>
23#include <errno.h>
24#include <sys/uio.h>
25#include <unistd.h>
26#include <sys/statvfs.h>
27#include <fcntl.h>
28#include <stddef.h>
11689d2a 29#include <linux/fs.h>
fb0951b0 30
f27a3864 31#include "btrfs-util.h"
cec736d2
LP
32#include "journal-def.h"
33#include "journal-file.h"
0284adc6 34#include "journal-authenticate.h"
cec736d2 35#include "lookup3.h"
807e17f0 36#include "compress.h"
3df3e884 37#include "random-util.h"
cec736d2 38
4a92baf3
LP
39#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
40#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 41
be19b7df 42#define COMPRESSION_SIZE_THRESHOLD (512ULL)
807e17f0 43
babfc091 44/* This is the minimum journal file size */
253f59df 45#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
babfc091
LP
46
47/* These are the lower and upper bounds if we deduce the max_use value
48 * from the file system size */
49#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
50#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
51
8580d1f7
LP
52/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
53#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
54
babfc091 55/* This is the upper bound if we deduce max_size from max_use */
71100051 56#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
babfc091
LP
57
58/* This is the upper bound if we deduce the keep_free value from the
59 * file system size */
60#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
61
62/* This is the keep_free value when we can't determine the system
63 * size */
64#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
65
8580d1f7
LP
66/* This is the default maximum number of journal files to keep around. */
67#define DEFAULT_N_MAX_FILES (100)
68
dca6219e
LP
69/* n_data was the first entry we added after the initial file format design */
70#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 71
a4bcff5b
LP
72/* How many entries to keep in the entry array chain cache at max */
73#define CHAIN_CACHE_MAX 20
74
a676e665
LP
75/* How much to increase the journal file size at once each time we allocate something new. */
76#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
77
2678031a
LP
78/* Reread fstat() of the file for detecting deletions at least this often */
79#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
80
fa6ac760
LP
/* The mmap context to use for the header: we pick the one right above the last defined object type */
82#define CONTEXT_HEADER _OBJECT_TYPE_MAX
83
9588bc32 84static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
85 assert(f);
86
87 if (!f->writable)
88 return -EPERM;
89
90 if (!(f->fd >= 0 && f->header))
91 return -EINVAL;
92
fa6ac760
LP
93 if (mmap_cache_got_sigbus(f->mmap, f->fd))
94 return -EIO;
95
26687bf8
OS
96 switch(f->header->state) {
97 case STATE_ONLINE:
98 return 0;
99
100 case STATE_OFFLINE:
101 f->header->state = STATE_ONLINE;
102 fsync(f->fd);
103 return 0;
104
105 default:
106 return -EINVAL;
107 }
108}
109
110int journal_file_set_offline(JournalFile *f) {
111 assert(f);
112
113 if (!f->writable)
114 return -EPERM;
115
116 if (!(f->fd >= 0 && f->header))
117 return -EINVAL;
118
119 if (f->header->state != STATE_ONLINE)
120 return 0;
121
122 fsync(f->fd);
123
fa6ac760
LP
124 if (mmap_cache_got_sigbus(f->mmap, f->fd))
125 return -EIO;
126
26687bf8
OS
127 f->header->state = STATE_OFFLINE;
128
fa6ac760
LP
129 if (mmap_cache_got_sigbus(f->mmap, f->fd))
130 return -EIO;
131
26687bf8
OS
132 fsync(f->fd);
133
134 return 0;
135}
136
804ae586 137JournalFile* journal_file_close(JournalFile *f) {
de190aef 138 assert(f);
cec736d2 139
feb12d3e 140#ifdef HAVE_GCRYPT
b0af6f41 141 /* Write the final tag */
c586dbf1 142 if (f->seal && f->writable)
b0af6f41 143 journal_file_append_tag(f);
feb12d3e 144#endif
b0af6f41 145
26687bf8 146 journal_file_set_offline(f);
cec736d2 147
fa6ac760
LP
148 if (f->mmap && f->fd >= 0)
149 mmap_cache_close_fd(f->mmap, f->fd);
cec736d2 150
11689d2a
LP
151 if (f->fd >= 0 && f->defrag_on_close) {
152
153 /* Be friendly to btrfs: turn COW back on again now,
154 * and defragment the file. We won't write to the file
155 * ever again, hence remove all fragmentation, and
156 * reenable all the good bits COW usually provides
157 * (such as data checksumming). */
158
1ed8f8c1 159 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
11689d2a
LP
160 (void) btrfs_defrag_fd(f->fd);
161 }
f27a3864 162
03e334a1 163 safe_close(f->fd);
cec736d2 164 free(f->path);
807e17f0 165
16e9f408
LP
166 if (f->mmap)
167 mmap_cache_unref(f->mmap);
168
4743015d 169 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 170
d89c8fdf 171#if defined(HAVE_XZ) || defined(HAVE_LZ4)
807e17f0
LP
172 free(f->compress_buffer);
173#endif
174
7560fffc 175#ifdef HAVE_GCRYPT
baed47c3
LP
176 if (f->fss_file)
177 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
dc4ebc07 178 else
b7c9ae91
LP
179 free(f->fsprg_state);
180
181 free(f->fsprg_seed);
7560fffc
LP
182
183 if (f->hmac)
184 gcry_md_close(f->hmac);
185#endif
186
cec736d2 187 free(f);
804ae586 188 return NULL;
cec736d2
LP
189}
190
0ac38b70 191static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 192 Header h = {};
cec736d2
LP
193 ssize_t k;
194 int r;
195
196 assert(f);
197
7560fffc 198 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 199 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 200
d89c8fdf
ZJS
201 h.incompatible_flags |= htole32(
202 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
203 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 204
d89c8fdf
ZJS
205 h.compatible_flags = htole32(
206 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 207
cec736d2
LP
208 r = sd_id128_randomize(&h.file_id);
209 if (r < 0)
210 return r;
211
0ac38b70
LP
212 if (template) {
213 h.seqnum_id = template->header->seqnum_id;
beec0085 214 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
215 } else
216 h.seqnum_id = h.file_id;
cec736d2
LP
217
218 k = pwrite(f->fd, &h, sizeof(h), 0);
219 if (k < 0)
220 return -errno;
221
222 if (k != sizeof(h))
223 return -EIO;
224
225 return 0;
226}
227
228static int journal_file_refresh_header(JournalFile *f) {
de190aef 229 sd_id128_t boot_id;
fa6ac760 230 int r;
cec736d2
LP
231
232 assert(f);
233
234 r = sd_id128_get_machine(&f->header->machine_id);
235 if (r < 0)
236 return r;
237
de190aef 238 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
239 if (r < 0)
240 return r;
241
de190aef
LP
242 if (sd_id128_equal(boot_id, f->header->boot_id))
243 f->tail_entry_monotonic_valid = true;
244
245 f->header->boot_id = boot_id;
246
fa6ac760 247 r = journal_file_set_online(f);
b788cc23 248
7560fffc 249 /* Sync the online state to disk */
a676e665 250 fsync(f->fd);
b788cc23 251
fa6ac760 252 return r;
cec736d2
LP
253}
254
255static int journal_file_verify_header(JournalFile *f) {
d89c8fdf
ZJS
256 uint32_t flags;
257
cec736d2
LP
258 assert(f);
259
7560fffc 260 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
261 return -EBADMSG;
262
7560fffc
LP
263 /* In both read and write mode we refuse to open files with
264 * incompatible flags we don't know */
d89c8fdf
ZJS
265 flags = le32toh(f->header->incompatible_flags);
266 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
267 if (flags & ~HEADER_INCOMPATIBLE_ANY)
268 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
269 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
270 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
271 if (flags)
272 log_debug("Journal file %s uses incompatible flags %"PRIx32
273 " disabled at compilation time.", f->path, flags);
cec736d2 274 return -EPROTONOSUPPORT;
d89c8fdf 275 }
cec736d2 276
7560fffc
LP
277 /* When open for writing we refuse to open files with
278 * compatible flags, too */
d89c8fdf
ZJS
279 flags = le32toh(f->header->compatible_flags);
280 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
281 if (flags & ~HEADER_COMPATIBLE_ANY)
282 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
283 f->path, flags & ~HEADER_COMPATIBLE_ANY);
284 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
285 if (flags)
286 log_debug("Journal file %s uses compatible flags %"PRIx32
287 " disabled at compilation time.", f->path, flags);
288 return -EPROTONOSUPPORT;
7560fffc
LP
289 }
290
db11ac1a
LP
291 if (f->header->state >= _STATE_MAX)
292 return -EBADMSG;
293
dca6219e
LP
294 /* The first addition was n_data, so check that we are at least this large */
295 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
296 return -EBADMSG;
297
8088cbd3 298 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
299 return -EBADMSG;
300
db11ac1a
LP
301 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
302 return -ENODATA;
303
304 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
305 return -ENODATA;
306
7762e02b
LP
307 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
308 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
309 !VALID64(le64toh(f->header->tail_object_offset)) ||
310 !VALID64(le64toh(f->header->entry_array_offset)))
311 return -ENODATA;
312
cec736d2 313 if (f->writable) {
ccdbaf91 314 uint8_t state;
cec736d2
LP
315 sd_id128_t machine_id;
316 int r;
317
318 r = sd_id128_get_machine(&machine_id);
319 if (r < 0)
320 return r;
321
322 if (!sd_id128_equal(machine_id, f->header->machine_id))
323 return -EHOSTDOWN;
324
de190aef 325 state = f->header->state;
cec736d2 326
71fa6f00
LP
327 if (state == STATE_ONLINE) {
328 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
329 return -EBUSY;
330 } else if (state == STATE_ARCHIVED)
cec736d2 331 return -ESHUTDOWN;
71fa6f00 332 else if (state != STATE_OFFLINE) {
8facc349 333 log_debug("Journal file %s has unknown state %i.", f->path, state);
71fa6f00
LP
334 return -EBUSY;
335 }
cec736d2
LP
336 }
337
d89c8fdf
ZJS
338 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
339 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 340
f1889c91 341 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 342
cec736d2
LP
343 return 0;
344}
345
2678031a
LP
346static int journal_file_fstat(JournalFile *f) {
347 assert(f);
348 assert(f->fd >= 0);
349
350 if (fstat(f->fd, &f->last_stat) < 0)
351 return -errno;
352
353 f->last_stat_usec = now(CLOCK_MONOTONIC);
354
355 /* Refuse appending to files that are already deleted */
356 if (f->last_stat.st_nlink <= 0)
357 return -EIDRM;
358
359 return 0;
360}
361
cec736d2 362static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 363 uint64_t old_size, new_size;
fec2aa2f 364 int r;
cec736d2
LP
365
366 assert(f);
367
cec736d2 368 /* We assume that this file is not sparse, and we know that
38ac38b2 369 * for sure, since we always call posix_fallocate()
cec736d2
LP
370 * ourselves */
371
fa6ac760
LP
372 if (mmap_cache_got_sigbus(f->mmap, f->fd))
373 return -EIO;
374
cec736d2 375 old_size =
23b0b2b2 376 le64toh(f->header->header_size) +
cec736d2
LP
377 le64toh(f->header->arena_size);
378
bc85bfee 379 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
380 if (new_size < le64toh(f->header->header_size))
381 new_size = le64toh(f->header->header_size);
bc85bfee 382
2678031a
LP
383 if (new_size <= old_size) {
384
385 /* We already pre-allocated enough space, but before
386 * we write to it, let's check with fstat() if the
387 * file got deleted, in order make sure we don't throw
388 * away the data immediately. Don't check fstat() for
389 * all writes though, but only once ever 10s. */
390
391 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
392 return 0;
393
394 return journal_file_fstat(f);
395 }
396
397 /* Allocate more space. */
cec736d2 398
a676e665 399 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 400 return -E2BIG;
cec736d2 401
a676e665 402 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
403 struct statvfs svfs;
404
405 if (fstatvfs(f->fd, &svfs) >= 0) {
406 uint64_t available;
407
070052ab 408 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
cec736d2
LP
409
410 if (new_size - old_size > available)
411 return -E2BIG;
412 }
413 }
414
eda4b58b
LP
415 /* Increase by larger blocks at once */
416 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
417 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
418 new_size = f->metrics.max_size;
419
bc85bfee
LP
420 /* Note that the glibc fallocate() fallback is very
421 inefficient, hence we try to minimize the allocation area
422 as we can. */
fec2aa2f
GV
423 r = posix_fallocate(f->fd, old_size, new_size - old_size);
424 if (r != 0)
425 return -r;
cec736d2 426
23b0b2b2 427 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 428
2678031a 429 return journal_file_fstat(f);
cec736d2
LP
430}
431
78519831 432static unsigned type_to_context(ObjectType type) {
d3d3208f 433 /* One context for each type, plus one catch-all for the rest */
69adae51 434 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 435 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 436 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
437}
438
7a9dabea 439static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
2678031a
LP
440 int r;
441
cec736d2 442 assert(f);
cec736d2
LP
443 assert(ret);
444
7762e02b
LP
445 if (size <= 0)
446 return -EINVAL;
447
2a59ea54 448 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
449 if (offset + size > (uint64_t) f->last_stat.st_size) {
450 /* Hmm, out of range? Let's refresh the fstat() data
451 * first, before we trust that check. */
452
2678031a
LP
453 r = journal_file_fstat(f);
454 if (r < 0)
455 return r;
456
457 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
458 return -EADDRNOTAVAIL;
459 }
460
7a9dabea 461 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
462}
463
16e9f408
LP
464static uint64_t minimum_header_size(Object *o) {
465
b8e891e6 466 static const uint64_t table[] = {
16e9f408
LP
467 [OBJECT_DATA] = sizeof(DataObject),
468 [OBJECT_FIELD] = sizeof(FieldObject),
469 [OBJECT_ENTRY] = sizeof(EntryObject),
470 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
471 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
472 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
473 [OBJECT_TAG] = sizeof(TagObject),
474 };
475
476 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
477 return sizeof(ObjectHeader);
478
479 return table[o->object.type];
480}
481
78519831 482int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
483 int r;
484 void *t;
485 Object *o;
486 uint64_t s;
487
488 assert(f);
489 assert(ret);
490
db11ac1a
LP
491 /* Objects may only be located at multiple of 64 bit */
492 if (!VALID64(offset))
493 return -EFAULT;
494
7a9dabea 495 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
496 if (r < 0)
497 return r;
498
499 o = (Object*) t;
500 s = le64toh(o->object.size);
501
502 if (s < sizeof(ObjectHeader))
503 return -EBADMSG;
504
16e9f408
LP
505 if (o->object.type <= OBJECT_UNUSED)
506 return -EBADMSG;
507
508 if (s < minimum_header_size(o))
509 return -EBADMSG;
510
d05089d8 511 if (type > OBJECT_UNUSED && o->object.type != type)
cec736d2
LP
512 return -EBADMSG;
513
514 if (s > sizeof(ObjectHeader)) {
7a9dabea 515 r = journal_file_move_to(f, type, false, offset, s, &t);
cec736d2
LP
516 if (r < 0)
517 return r;
518
519 o = (Object*) t;
520 }
521
cec736d2
LP
522 *ret = o;
523 return 0;
524}
525
d98cc1f2 526static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
527 uint64_t r;
528
529 assert(f);
530
beec0085 531 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
532
533 if (seqnum) {
de190aef 534 /* If an external seqnum counter was passed, we update
c2373f84
LP
535 * both the local and the external one, and set it to
536 * the maximum of both */
537
538 if (*seqnum + 1 > r)
539 r = *seqnum + 1;
540
541 *seqnum = r;
542 }
543
beec0085 544 f->header->tail_entry_seqnum = htole64(r);
cec736d2 545
beec0085
LP
546 if (f->header->head_entry_seqnum == 0)
547 f->header->head_entry_seqnum = htole64(r);
de190aef 548
cec736d2
LP
549 return r;
550}
551
78519831 552int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
553 int r;
554 uint64_t p;
555 Object *tail, *o;
556 void *t;
557
558 assert(f);
d05089d8 559 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
560 assert(size >= sizeof(ObjectHeader));
561 assert(offset);
562 assert(ret);
563
26687bf8
OS
564 r = journal_file_set_online(f);
565 if (r < 0)
566 return r;
567
cec736d2 568 p = le64toh(f->header->tail_object_offset);
cec736d2 569 if (p == 0)
23b0b2b2 570 p = le64toh(f->header->header_size);
cec736d2 571 else {
d05089d8 572 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
573 if (r < 0)
574 return r;
575
576 p += ALIGN64(le64toh(tail->object.size));
577 }
578
579 r = journal_file_allocate(f, p, size);
580 if (r < 0)
581 return r;
582
fcde2389 583 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
584 if (r < 0)
585 return r;
586
587 o = (Object*) t;
588
589 zero(o->object);
de190aef 590 o->object.type = type;
cec736d2
LP
591 o->object.size = htole64(size);
592
593 f->header->tail_object_offset = htole64(p);
cec736d2
LP
594 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
595
596 *ret = o;
597 *offset = p;
598
599 return 0;
600}
601
de190aef 602static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
603 uint64_t s, p;
604 Object *o;
605 int r;
606
607 assert(f);
608
070052ab
LP
609 /* We estimate that we need 1 hash table entry per 768 bytes
610 of journal file and we want to make sure we never get
611 beyond 75% fill level. Calculate the hash table size for
612 the maximum file size based on these metrics. */
4a92baf3 613
dfabe643 614 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
615 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
616 s = DEFAULT_DATA_HASH_TABLE_SIZE;
617
507f22bd 618 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 619
de190aef
LP
620 r = journal_file_append_object(f,
621 OBJECT_DATA_HASH_TABLE,
622 offsetof(Object, hash_table.items) + s,
623 &o, &p);
cec736d2
LP
624 if (r < 0)
625 return r;
626
29804cc1 627 memzero(o->hash_table.items, s);
cec736d2 628
de190aef
LP
629 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
630 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
631
632 return 0;
633}
634
de190aef 635static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
636 uint64_t s, p;
637 Object *o;
638 int r;
639
640 assert(f);
641
3c1668da
LP
642 /* We use a fixed size hash table for the fields as this
643 * number should grow very slowly only */
644
de190aef
LP
645 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
646 r = journal_file_append_object(f,
647 OBJECT_FIELD_HASH_TABLE,
648 offsetof(Object, hash_table.items) + s,
649 &o, &p);
cec736d2
LP
650 if (r < 0)
651 return r;
652
29804cc1 653 memzero(o->hash_table.items, s);
cec736d2 654
de190aef
LP
655 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
656 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
657
658 return 0;
659}
660
dade37d4 661int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
662 uint64_t s, p;
663 void *t;
664 int r;
665
666 assert(f);
667
dade37d4
LP
668 if (f->data_hash_table)
669 return 0;
670
de190aef
LP
671 p = le64toh(f->header->data_hash_table_offset);
672 s = le64toh(f->header->data_hash_table_size);
cec736d2 673
de190aef 674 r = journal_file_move_to(f,
16e9f408 675 OBJECT_DATA_HASH_TABLE,
fcde2389 676 true,
de190aef
LP
677 p, s,
678 &t);
cec736d2
LP
679 if (r < 0)
680 return r;
681
de190aef 682 f->data_hash_table = t;
cec736d2
LP
683 return 0;
684}
685
dade37d4 686int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
687 uint64_t s, p;
688 void *t;
689 int r;
690
691 assert(f);
692
dade37d4
LP
693 if (f->field_hash_table)
694 return 0;
695
de190aef
LP
696 p = le64toh(f->header->field_hash_table_offset);
697 s = le64toh(f->header->field_hash_table_size);
cec736d2 698
de190aef 699 r = journal_file_move_to(f,
16e9f408 700 OBJECT_FIELD_HASH_TABLE,
fcde2389 701 true,
de190aef
LP
702 p, s,
703 &t);
cec736d2
LP
704 if (r < 0)
705 return r;
706
de190aef 707 f->field_hash_table = t;
cec736d2
LP
708 return 0;
709}
710
3c1668da
LP
711static int journal_file_link_field(
712 JournalFile *f,
713 Object *o,
714 uint64_t offset,
715 uint64_t hash) {
716
805d1486 717 uint64_t p, h, m;
3c1668da
LP
718 int r;
719
720 assert(f);
721 assert(o);
722 assert(offset > 0);
723
724 if (o->object.type != OBJECT_FIELD)
725 return -EINVAL;
726
805d1486
LP
727 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
728 if (m <= 0)
729 return -EBADMSG;
3c1668da 730
805d1486 731 /* This might alter the window we are looking at */
3c1668da
LP
732 o->field.next_hash_offset = o->field.head_data_offset = 0;
733
805d1486 734 h = hash % m;
3c1668da
LP
735 p = le64toh(f->field_hash_table[h].tail_hash_offset);
736 if (p == 0)
737 f->field_hash_table[h].head_hash_offset = htole64(offset);
738 else {
739 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
740 if (r < 0)
741 return r;
742
743 o->field.next_hash_offset = htole64(offset);
744 }
745
746 f->field_hash_table[h].tail_hash_offset = htole64(offset);
747
748 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
749 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
750
751 return 0;
752}
753
754static int journal_file_link_data(
755 JournalFile *f,
756 Object *o,
757 uint64_t offset,
758 uint64_t hash) {
759
805d1486 760 uint64_t p, h, m;
cec736d2
LP
761 int r;
762
763 assert(f);
764 assert(o);
765 assert(offset > 0);
b588975f
LP
766
767 if (o->object.type != OBJECT_DATA)
768 return -EINVAL;
cec736d2 769
805d1486
LP
770 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
771 if (m <= 0)
772 return -EBADMSG;
48496df6 773
805d1486 774 /* This might alter the window we are looking at */
de190aef
LP
775 o->data.next_hash_offset = o->data.next_field_offset = 0;
776 o->data.entry_offset = o->data.entry_array_offset = 0;
777 o->data.n_entries = 0;
cec736d2 778
805d1486 779 h = hash % m;
8db4213e 780 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 781 if (p == 0)
cec736d2 782 /* Only entry in the hash table is easy */
de190aef 783 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 784 else {
48496df6
LP
785 /* Move back to the previous data object, to patch in
786 * pointer */
cec736d2 787
de190aef 788 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
789 if (r < 0)
790 return r;
791
de190aef 792 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
793 }
794
de190aef 795 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 796
dca6219e
LP
797 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
798 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
799
cec736d2
LP
800 return 0;
801}
802
3c1668da
LP
803int journal_file_find_field_object_with_hash(
804 JournalFile *f,
805 const void *field, uint64_t size, uint64_t hash,
806 Object **ret, uint64_t *offset) {
807
805d1486 808 uint64_t p, osize, h, m;
3c1668da
LP
809 int r;
810
811 assert(f);
812 assert(field && size > 0);
813
dade37d4
LP
814 /* If the field hash table is empty, we can't find anything */
815 if (le64toh(f->header->field_hash_table_size) <= 0)
816 return 0;
817
818 /* Map the field hash table, if it isn't mapped yet. */
819 r = journal_file_map_field_hash_table(f);
820 if (r < 0)
821 return r;
822
3c1668da
LP
823 osize = offsetof(Object, field.payload) + size;
824
805d1486 825 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
805d1486 826 if (m <= 0)
3c1668da
LP
827 return -EBADMSG;
828
805d1486 829 h = hash % m;
3c1668da
LP
830 p = le64toh(f->field_hash_table[h].head_hash_offset);
831
832 while (p > 0) {
833 Object *o;
834
835 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
836 if (r < 0)
837 return r;
838
839 if (le64toh(o->field.hash) == hash &&
840 le64toh(o->object.size) == osize &&
841 memcmp(o->field.payload, field, size) == 0) {
842
843 if (ret)
844 *ret = o;
845 if (offset)
846 *offset = p;
847
848 return 1;
849 }
850
851 p = le64toh(o->field.next_hash_offset);
852 }
853
854 return 0;
855}
856
857int journal_file_find_field_object(
858 JournalFile *f,
859 const void *field, uint64_t size,
860 Object **ret, uint64_t *offset) {
861
862 uint64_t hash;
863
864 assert(f);
865 assert(field && size > 0);
866
867 hash = hash64(field, size);
868
869 return journal_file_find_field_object_with_hash(f,
870 field, size, hash,
871 ret, offset);
872}
873
de190aef
LP
874int journal_file_find_data_object_with_hash(
875 JournalFile *f,
876 const void *data, uint64_t size, uint64_t hash,
877 Object **ret, uint64_t *offset) {
48496df6 878
805d1486 879 uint64_t p, osize, h, m;
cec736d2
LP
880 int r;
881
882 assert(f);
883 assert(data || size == 0);
884
dade37d4
LP
885 /* If there's no data hash table, then there's no entry. */
886 if (le64toh(f->header->data_hash_table_size) <= 0)
887 return 0;
888
889 /* Map the data hash table, if it isn't mapped yet. */
890 r = journal_file_map_data_hash_table(f);
891 if (r < 0)
892 return r;
893
cec736d2
LP
894 osize = offsetof(Object, data.payload) + size;
895
805d1486
LP
896 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
897 if (m <= 0)
bc85bfee
LP
898 return -EBADMSG;
899
805d1486 900 h = hash % m;
de190aef 901 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 902
de190aef
LP
903 while (p > 0) {
904 Object *o;
cec736d2 905
de190aef 906 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
907 if (r < 0)
908 return r;
909
807e17f0 910 if (le64toh(o->data.hash) != hash)
85a131e8 911 goto next;
807e17f0 912
d89c8fdf 913 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 914#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51 915 uint64_t l;
a7f7d1bd 916 size_t rsize = 0;
cec736d2 917
807e17f0
LP
918 l = le64toh(o->object.size);
919 if (l <= offsetof(Object, data.payload))
cec736d2
LP
920 return -EBADMSG;
921
807e17f0
LP
922 l -= offsetof(Object, data.payload);
923
d89c8fdf
ZJS
924 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
925 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
926 if (r < 0)
927 return r;
807e17f0 928
b785c858 929 if (rsize == size &&
807e17f0
LP
930 memcmp(f->compress_buffer, data, size) == 0) {
931
932 if (ret)
933 *ret = o;
934
935 if (offset)
936 *offset = p;
937
938 return 1;
939 }
3b1a55e1
ZJS
940#else
941 return -EPROTONOSUPPORT;
942#endif
807e17f0
LP
943 } else if (le64toh(o->object.size) == osize &&
944 memcmp(o->data.payload, data, size) == 0) {
945
cec736d2
LP
946 if (ret)
947 *ret = o;
948
949 if (offset)
950 *offset = p;
951
de190aef 952 return 1;
cec736d2
LP
953 }
954
85a131e8 955 next:
cec736d2
LP
956 p = le64toh(o->data.next_hash_offset);
957 }
958
de190aef
LP
959 return 0;
960}
961
962int journal_file_find_data_object(
963 JournalFile *f,
964 const void *data, uint64_t size,
965 Object **ret, uint64_t *offset) {
966
967 uint64_t hash;
968
969 assert(f);
970 assert(data || size == 0);
971
972 hash = hash64(data, size);
973
974 return journal_file_find_data_object_with_hash(f,
975 data, size, hash,
976 ret, offset);
977}
978
3c1668da
LP
979static int journal_file_append_field(
980 JournalFile *f,
981 const void *field, uint64_t size,
982 Object **ret, uint64_t *offset) {
983
984 uint64_t hash, p;
985 uint64_t osize;
986 Object *o;
987 int r;
988
989 assert(f);
990 assert(field && size > 0);
991
992 hash = hash64(field, size);
993
994 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
995 if (r < 0)
996 return r;
997 else if (r > 0) {
998
999 if (ret)
1000 *ret = o;
1001
1002 if (offset)
1003 *offset = p;
1004
1005 return 0;
1006 }
1007
1008 osize = offsetof(Object, field.payload) + size;
1009 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
1010 if (r < 0)
1011 return r;
3c1668da
LP
1012
1013 o->field.hash = htole64(hash);
1014 memcpy(o->field.payload, field, size);
1015
1016 r = journal_file_link_field(f, o, p, hash);
1017 if (r < 0)
1018 return r;
1019
1020 /* The linking might have altered the window, so let's
1021 * refresh our pointer */
1022 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1023 if (r < 0)
1024 return r;
1025
1026#ifdef HAVE_GCRYPT
1027 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1028 if (r < 0)
1029 return r;
1030#endif
1031
1032 if (ret)
1033 *ret = o;
1034
1035 if (offset)
1036 *offset = p;
1037
1038 return 0;
1039}
1040
48496df6
LP
1041static int journal_file_append_data(
1042 JournalFile *f,
1043 const void *data, uint64_t size,
1044 Object **ret, uint64_t *offset) {
1045
de190aef
LP
1046 uint64_t hash, p;
1047 uint64_t osize;
1048 Object *o;
d89c8fdf 1049 int r, compression = 0;
3c1668da 1050 const void *eq;
de190aef
LP
1051
1052 assert(f);
1053 assert(data || size == 0);
1054
1055 hash = hash64(data, size);
1056
1057 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1058 if (r < 0)
1059 return r;
1060 else if (r > 0) {
1061
1062 if (ret)
1063 *ret = o;
1064
1065 if (offset)
1066 *offset = p;
1067
1068 return 0;
1069 }
1070
1071 osize = offsetof(Object, data.payload) + size;
1072 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1073 if (r < 0)
1074 return r;
1075
cec736d2 1076 o->data.hash = htole64(hash);
807e17f0 1077
d89c8fdf
ZJS
1078#if defined(HAVE_XZ) || defined(HAVE_LZ4)
1079 if (f->compress_xz &&
807e17f0 1080 size >= COMPRESSION_SIZE_THRESHOLD) {
a7f7d1bd 1081 size_t rsize = 0;
807e17f0 1082
d89c8fdf 1083 compression = compress_blob(data, size, o->data.payload, &rsize);
807e17f0 1084
d89c8fdf 1085 if (compression) {
807e17f0 1086 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1087 o->object.flags |= compression;
807e17f0 1088
fa1c4b51 1089 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1090 size, rsize, object_compressed_to_string(compression));
807e17f0
LP
1091 }
1092 }
1093#endif
1094
d89c8fdf 1095 if (!compression && size > 0)
807e17f0 1096 memcpy(o->data.payload, data, size);
cec736d2 1097
de190aef 1098 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1099 if (r < 0)
1100 return r;
1101
48496df6
LP
1102 /* The linking might have altered the window, so let's
1103 * refresh our pointer */
1104 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1105 if (r < 0)
1106 return r;
1107
08c6f819
SL
1108 if (!data)
1109 eq = NULL;
1110 else
1111 eq = memchr(data, '=', size);
3c1668da 1112 if (eq && eq > data) {
748db592 1113 Object *fo = NULL;
3c1668da 1114 uint64_t fp;
3c1668da
LP
1115
1116 /* Create field object ... */
1117 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1118 if (r < 0)
1119 return r;
1120
1121 /* ... and link it in. */
1122 o->data.next_field_offset = fo->field.head_data_offset;
1123 fo->field.head_data_offset = le64toh(p);
1124 }
1125
5996c7c2
LP
1126#ifdef HAVE_GCRYPT
1127 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1128 if (r < 0)
1129 return r;
1130#endif
1131
cec736d2
LP
1132 if (ret)
1133 *ret = o;
1134
1135 if (offset)
de190aef 1136 *offset = p;
cec736d2
LP
1137
1138 return 0;
1139}
1140
1141uint64_t journal_file_entry_n_items(Object *o) {
1142 assert(o);
b588975f
LP
1143
1144 if (o->object.type != OBJECT_ENTRY)
1145 return 0;
cec736d2
LP
1146
1147 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1148}
1149
0284adc6 1150uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1151 assert(o);
b588975f
LP
1152
1153 if (o->object.type != OBJECT_ENTRY_ARRAY)
1154 return 0;
de190aef
LP
1155
1156 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1157}
1158
fb9a24b6
LP
1159uint64_t journal_file_hash_table_n_items(Object *o) {
1160 assert(o);
b588975f
LP
1161
1162 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1163 o->object.type != OBJECT_FIELD_HASH_TABLE)
1164 return 0;
fb9a24b6
LP
1165
1166 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1167}
1168
de190aef 1169static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1170 le64_t *first,
1171 le64_t *idx,
de190aef 1172 uint64_t p) {
cec736d2 1173 int r;
de190aef
LP
1174 uint64_t n = 0, ap = 0, q, i, a, hidx;
1175 Object *o;
1176
cec736d2 1177 assert(f);
de190aef
LP
1178 assert(first);
1179 assert(idx);
1180 assert(p > 0);
cec736d2 1181
de190aef
LP
1182 a = le64toh(*first);
1183 i = hidx = le64toh(*idx);
1184 while (a > 0) {
1185
1186 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1187 if (r < 0)
1188 return r;
cec736d2 1189
de190aef
LP
1190 n = journal_file_entry_array_n_items(o);
1191 if (i < n) {
1192 o->entry_array.items[i] = htole64(p);
1193 *idx = htole64(hidx + 1);
1194 return 0;
1195 }
cec736d2 1196
de190aef
LP
1197 i -= n;
1198 ap = a;
1199 a = le64toh(o->entry_array.next_entry_array_offset);
1200 }
1201
1202 if (hidx > n)
1203 n = (hidx+1) * 2;
1204 else
1205 n = n * 2;
1206
1207 if (n < 4)
1208 n = 4;
1209
1210 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1211 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1212 &o, &q);
cec736d2
LP
1213 if (r < 0)
1214 return r;
1215
feb12d3e 1216#ifdef HAVE_GCRYPT
5996c7c2 1217 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1218 if (r < 0)
1219 return r;
feb12d3e 1220#endif
b0af6f41 1221
de190aef 1222 o->entry_array.items[i] = htole64(p);
cec736d2 1223
de190aef 1224 if (ap == 0)
7be3aa17 1225 *first = htole64(q);
cec736d2 1226 else {
de190aef 1227 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1228 if (r < 0)
1229 return r;
1230
de190aef
LP
1231 o->entry_array.next_entry_array_offset = htole64(q);
1232 }
cec736d2 1233
2dee23eb
LP
1234 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1235 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1236
de190aef
LP
1237 *idx = htole64(hidx + 1);
1238
1239 return 0;
1240}
cec736d2 1241
de190aef 1242static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1243 le64_t *extra,
1244 le64_t *first,
1245 le64_t *idx,
de190aef
LP
1246 uint64_t p) {
1247
1248 int r;
1249
1250 assert(f);
1251 assert(extra);
1252 assert(first);
1253 assert(idx);
1254 assert(p > 0);
1255
1256 if (*idx == 0)
1257 *extra = htole64(p);
1258 else {
4fd052ae 1259 le64_t i;
de190aef 1260
7be3aa17 1261 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1262 r = link_entry_into_array(f, first, &i, p);
1263 if (r < 0)
1264 return r;
cec736d2
LP
1265 }
1266
de190aef
LP
1267 *idx = htole64(le64toh(*idx) + 1);
1268 return 0;
1269}
1270
1271static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1272 uint64_t p;
1273 int r;
1274 assert(f);
1275 assert(o);
1276 assert(offset > 0);
1277
1278 p = le64toh(o->entry.items[i].object_offset);
1279 if (p == 0)
1280 return -EINVAL;
1281
1282 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1283 if (r < 0)
1284 return r;
1285
de190aef
LP
1286 return link_entry_into_array_plus_one(f,
1287 &o->data.entry_offset,
1288 &o->data.entry_array_offset,
1289 &o->data.n_entries,
1290 offset);
cec736d2
LP
1291}
1292
1293static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1294 uint64_t n, i;
cec736d2
LP
1295 int r;
1296
1297 assert(f);
1298 assert(o);
1299 assert(offset > 0);
b588975f
LP
1300
1301 if (o->object.type != OBJECT_ENTRY)
1302 return -EINVAL;
cec736d2 1303
b788cc23
LP
1304 __sync_synchronize();
1305
cec736d2 1306 /* Link up the entry itself */
de190aef
LP
1307 r = link_entry_into_array(f,
1308 &f->header->entry_array_offset,
1309 &f->header->n_entries,
1310 offset);
1311 if (r < 0)
1312 return r;
cec736d2 1313
507f22bd 1314 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1315
de190aef 1316 if (f->header->head_entry_realtime == 0)
0ac38b70 1317 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1318
0ac38b70 1319 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1320 f->header->tail_entry_monotonic = o->entry.monotonic;
1321
1322 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1323
1324 /* Link up the items */
1325 n = journal_file_entry_n_items(o);
1326 for (i = 0; i < n; i++) {
1327 r = journal_file_link_entry_item(f, o, offset, i);
1328 if (r < 0)
1329 return r;
1330 }
1331
cec736d2
LP
1332 return 0;
1333}
1334
1335static int journal_file_append_entry_internal(
1336 JournalFile *f,
1337 const dual_timestamp *ts,
1338 uint64_t xor_hash,
1339 const EntryItem items[], unsigned n_items,
de190aef 1340 uint64_t *seqnum,
cec736d2
LP
1341 Object **ret, uint64_t *offset) {
1342 uint64_t np;
1343 uint64_t osize;
1344 Object *o;
1345 int r;
1346
1347 assert(f);
1348 assert(items || n_items == 0);
de190aef 1349 assert(ts);
cec736d2
LP
1350
1351 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1352
de190aef 1353 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1354 if (r < 0)
1355 return r;
1356
d98cc1f2 1357 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1358 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1359 o->entry.realtime = htole64(ts->realtime);
1360 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1361 o->entry.xor_hash = htole64(xor_hash);
1362 o->entry.boot_id = f->header->boot_id;
1363
feb12d3e 1364#ifdef HAVE_GCRYPT
5996c7c2 1365 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1366 if (r < 0)
1367 return r;
feb12d3e 1368#endif
b0af6f41 1369
cec736d2
LP
1370 r = journal_file_link_entry(f, o, np);
1371 if (r < 0)
1372 return r;
1373
1374 if (ret)
1375 *ret = o;
1376
1377 if (offset)
1378 *offset = np;
1379
1380 return 0;
1381}
1382
cf244689 1383void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1384 assert(f);
1385
1386 /* inotify() does not receive IN_MODIFY events from file
1387 * accesses done via mmap(). After each access we hence
1388 * trigger IN_MODIFY by truncating the journal file to its
1389 * current size which triggers IN_MODIFY. */
1390
bc85bfee
LP
1391 __sync_synchronize();
1392
50f20cfd 1393 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
56f64d95 1394 log_error_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1395}
1396
1f2da9ec
LP
1397static int entry_item_cmp(const void *_a, const void *_b) {
1398 const EntryItem *a = _a, *b = _b;
1399
1400 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1401 return -1;
1402 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1403 return 1;
1404 return 0;
1405}
1406
de190aef 1407int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1408 unsigned i;
1409 EntryItem *items;
1410 int r;
1411 uint64_t xor_hash = 0;
de190aef 1412 struct dual_timestamp _ts;
cec736d2
LP
1413
1414 assert(f);
1415 assert(iovec || n_iovec == 0);
1416
de190aef
LP
1417 if (!ts) {
1418 dual_timestamp_get(&_ts);
1419 ts = &_ts;
1420 }
1421
1422 if (f->tail_entry_monotonic_valid &&
1423 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1424 return -EINVAL;
1425
feb12d3e 1426#ifdef HAVE_GCRYPT
7560fffc
LP
1427 r = journal_file_maybe_append_tag(f, ts->realtime);
1428 if (r < 0)
1429 return r;
feb12d3e 1430#endif
7560fffc 1431
64825d3c 1432 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1433 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1434
1435 for (i = 0; i < n_iovec; i++) {
1436 uint64_t p;
1437 Object *o;
1438
1439 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1440 if (r < 0)
cf244689 1441 return r;
cec736d2
LP
1442
1443 xor_hash ^= le64toh(o->data.hash);
1444 items[i].object_offset = htole64(p);
de7b95cd 1445 items[i].hash = o->data.hash;
cec736d2
LP
1446 }
1447
1f2da9ec
LP
1448 /* Order by the position on disk, in order to improve seek
1449 * times for rotating media. */
7ff7394d 1450 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1451
de190aef 1452 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1453
fa6ac760
LP
1454 /* If the memory mapping triggered a SIGBUS then we return an
1455 * IO error and ignore the error code passed down to us, since
1456 * it is very likely just an effect of a nullified replacement
1457 * mapping page */
1458
1459 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1460 r = -EIO;
1461
50f20cfd
LP
1462 journal_file_post_change(f);
1463
cec736d2
LP
1464 return r;
1465}
1466
/* Remembers how far into an entry array chain a previous lookup got, so that
 * a later lookup in the same chain can skip the arrays in front of it. */
typedef struct ChainCacheItem ChainCacheItem;

struct ChainCacheItem {
        /* Offset of the array at the beginning of the chain */
        uint64_t first;

        /* Offset of the cached array */
        uint64_t array;

        /* The first item in the cached array */
        uint64_t begin;

        /* Total number of items in all arrays before this one in the chain */
        uint64_t total;

        /* The last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
};
1474
1475static void chain_cache_put(
4743015d 1476 OrderedHashmap *h,
a4bcff5b
LP
1477 ChainCacheItem *ci,
1478 uint64_t first,
1479 uint64_t array,
1480 uint64_t begin,
f268980d
LP
1481 uint64_t total,
1482 uint64_t last_index) {
a4bcff5b
LP
1483
1484 if (!ci) {
34741aa3
LP
1485 /* If the chain item to cache for this chain is the
1486 * first one it's not worth caching anything */
1487 if (array == first)
1488 return;
1489
29433089 1490 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 1491 ci = ordered_hashmap_steal_first(h);
29433089
LP
1492 assert(ci);
1493 } else {
a4bcff5b
LP
1494 ci = new(ChainCacheItem, 1);
1495 if (!ci)
1496 return;
1497 }
1498
1499 ci->first = first;
1500
4743015d 1501 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
1502 free(ci);
1503 return;
1504 }
1505 } else
1506 assert(ci->first == first);
1507
1508 ci->array = array;
1509 ci->begin = begin;
1510 ci->total = total;
f268980d 1511 ci->last_index = last_index;
a4bcff5b
LP
1512}
1513
f268980d
LP
1514static int generic_array_get(
1515 JournalFile *f,
1516 uint64_t first,
1517 uint64_t i,
1518 Object **ret, uint64_t *offset) {
de190aef 1519
cec736d2 1520 Object *o;
a4bcff5b 1521 uint64_t p = 0, a, t = 0;
cec736d2 1522 int r;
a4bcff5b 1523 ChainCacheItem *ci;
cec736d2
LP
1524
1525 assert(f);
1526
de190aef 1527 a = first;
a4bcff5b
LP
1528
1529 /* Try the chain cache first */
4743015d 1530 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1531 if (ci && i > ci->total) {
1532 a = ci->array;
1533 i -= ci->total;
1534 t = ci->total;
1535 }
1536
de190aef 1537 while (a > 0) {
a4bcff5b 1538 uint64_t k;
cec736d2 1539
de190aef
LP
1540 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1541 if (r < 0)
1542 return r;
cec736d2 1543
a4bcff5b
LP
1544 k = journal_file_entry_array_n_items(o);
1545 if (i < k) {
de190aef 1546 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1547 goto found;
cec736d2
LP
1548 }
1549
a4bcff5b
LP
1550 i -= k;
1551 t += k;
de190aef
LP
1552 a = le64toh(o->entry_array.next_entry_array_offset);
1553 }
1554
a4bcff5b
LP
1555 return 0;
1556
1557found:
1558 /* Let's cache this item for the next invocation */
af13a6b0 1559 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1560
1561 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1562 if (r < 0)
1563 return r;
1564
1565 if (ret)
1566 *ret = o;
1567
1568 if (offset)
1569 *offset = p;
1570
1571 return 1;
1572}
1573
f268980d
LP
1574static int generic_array_get_plus_one(
1575 JournalFile *f,
1576 uint64_t extra,
1577 uint64_t first,
1578 uint64_t i,
1579 Object **ret, uint64_t *offset) {
de190aef
LP
1580
1581 Object *o;
1582
1583 assert(f);
1584
1585 if (i == 0) {
1586 int r;
1587
1588 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1589 if (r < 0)
1590 return r;
1591
de190aef
LP
1592 if (ret)
1593 *ret = o;
cec736d2 1594
de190aef
LP
1595 if (offset)
1596 *offset = extra;
cec736d2 1597
de190aef 1598 return 1;
cec736d2
LP
1599 }
1600
de190aef
LP
1601 return generic_array_get(f, first, i-1, ret, offset);
1602}
cec736d2 1603
de190aef
LP
/* Results of the test_object callbacks used when bisecting entry arrays */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object sorts before the needle */
        TEST_RIGHT = 2  /* the tested object sorts after the needle */
};
cec736d2 1609
f268980d
LP
1610static int generic_array_bisect(
1611 JournalFile *f,
1612 uint64_t first,
1613 uint64_t n,
1614 uint64_t needle,
1615 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1616 direction_t direction,
1617 Object **ret,
1618 uint64_t *offset,
1619 uint64_t *idx) {
1620
1621 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1622 bool subtract_one = false;
1623 Object *o, *array = NULL;
1624 int r;
a4bcff5b 1625 ChainCacheItem *ci;
cec736d2 1626
de190aef
LP
1627 assert(f);
1628 assert(test_object);
cec736d2 1629
a4bcff5b 1630 /* Start with the first array in the chain */
de190aef 1631 a = first;
a4bcff5b 1632
4743015d 1633 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1634 if (ci && n > ci->total) {
1635 /* Ah, we have iterated this bisection array chain
1636 * previously! Let's see if we can skip ahead in the
1637 * chain, as far as the last time. But we can't jump
1638 * backwards in the chain, so let's check that
1639 * first. */
1640
1641 r = test_object(f, ci->begin, needle);
1642 if (r < 0)
1643 return r;
1644
1645 if (r == TEST_LEFT) {
f268980d 1646 /* OK, what we are looking for is right of the
a4bcff5b
LP
1647 * begin of this EntryArray, so let's jump
1648 * straight to previously cached array in the
1649 * chain */
1650
1651 a = ci->array;
1652 n -= ci->total;
1653 t = ci->total;
f268980d 1654 last_index = ci->last_index;
a4bcff5b
LP
1655 }
1656 }
1657
de190aef
LP
1658 while (a > 0) {
1659 uint64_t left, right, k, lp;
1660
1661 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1662 if (r < 0)
1663 return r;
1664
de190aef
LP
1665 k = journal_file_entry_array_n_items(array);
1666 right = MIN(k, n);
1667 if (right <= 0)
1668 return 0;
cec736d2 1669
de190aef
LP
1670 i = right - 1;
1671 lp = p = le64toh(array->entry_array.items[i]);
1672 if (p <= 0)
1673 return -EBADMSG;
cec736d2 1674
de190aef
LP
1675 r = test_object(f, p, needle);
1676 if (r < 0)
1677 return r;
cec736d2 1678
de190aef
LP
1679 if (r == TEST_FOUND)
1680 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1681
1682 if (r == TEST_RIGHT) {
1683 left = 0;
1684 right -= 1;
f268980d
LP
1685
1686 if (last_index != (uint64_t) -1) {
1687 assert(last_index <= right);
1688
1689 /* If we cached the last index we
1690 * looked at, let's try to not to jump
1691 * too wildly around and see if we can
1692 * limit the range to look at early to
1693 * the immediate neighbors of the last
1694 * index we looked at. */
1695
1696 if (last_index > 0) {
1697 uint64_t x = last_index - 1;
1698
1699 p = le64toh(array->entry_array.items[x]);
1700 if (p <= 0)
1701 return -EBADMSG;
1702
1703 r = test_object(f, p, needle);
1704 if (r < 0)
1705 return r;
1706
1707 if (r == TEST_FOUND)
1708 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1709
1710 if (r == TEST_RIGHT)
1711 right = x;
1712 else
1713 left = x + 1;
1714 }
1715
1716 if (last_index < right) {
1717 uint64_t y = last_index + 1;
1718
1719 p = le64toh(array->entry_array.items[y]);
1720 if (p <= 0)
1721 return -EBADMSG;
1722
1723 r = test_object(f, p, needle);
1724 if (r < 0)
1725 return r;
1726
1727 if (r == TEST_FOUND)
1728 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1729
1730 if (r == TEST_RIGHT)
1731 right = y;
1732 else
1733 left = y + 1;
1734 }
f268980d
LP
1735 }
1736
de190aef
LP
1737 for (;;) {
1738 if (left == right) {
1739 if (direction == DIRECTION_UP)
1740 subtract_one = true;
1741
1742 i = left;
1743 goto found;
1744 }
1745
1746 assert(left < right);
de190aef 1747 i = (left + right) / 2;
f268980d 1748
de190aef
LP
1749 p = le64toh(array->entry_array.items[i]);
1750 if (p <= 0)
1751 return -EBADMSG;
1752
1753 r = test_object(f, p, needle);
1754 if (r < 0)
1755 return r;
cec736d2 1756
de190aef
LP
1757 if (r == TEST_FOUND)
1758 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1759
1760 if (r == TEST_RIGHT)
1761 right = i;
1762 else
1763 left = i + 1;
1764 }
1765 }
1766
2173cbf8 1767 if (k >= n) {
cbdca852
LP
1768 if (direction == DIRECTION_UP) {
1769 i = n;
1770 subtract_one = true;
1771 goto found;
1772 }
1773
cec736d2 1774 return 0;
cbdca852 1775 }
cec736d2 1776
de190aef
LP
1777 last_p = lp;
1778
1779 n -= k;
1780 t += k;
f268980d 1781 last_index = (uint64_t) -1;
de190aef 1782 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1783 }
1784
1785 return 0;
de190aef
LP
1786
1787found:
1788 if (subtract_one && t == 0 && i == 0)
1789 return 0;
1790
a4bcff5b 1791 /* Let's cache this item for the next invocation */
af13a6b0 1792 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 1793
de190aef
LP
1794 if (subtract_one && i == 0)
1795 p = last_p;
1796 else if (subtract_one)
1797 p = le64toh(array->entry_array.items[i-1]);
1798 else
1799 p = le64toh(array->entry_array.items[i]);
1800
1801 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1802 if (r < 0)
1803 return r;
1804
1805 if (ret)
1806 *ret = o;
1807
1808 if (offset)
1809 *offset = p;
1810
1811 if (idx)
cbdca852 1812 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1813
1814 return 1;
cec736d2
LP
1815}
1816
f268980d
LP
1817static int generic_array_bisect_plus_one(
1818 JournalFile *f,
1819 uint64_t extra,
1820 uint64_t first,
1821 uint64_t n,
1822 uint64_t needle,
1823 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1824 direction_t direction,
1825 Object **ret,
1826 uint64_t *offset,
1827 uint64_t *idx) {
de190aef 1828
cec736d2 1829 int r;
cbdca852
LP
1830 bool step_back = false;
1831 Object *o;
cec736d2
LP
1832
1833 assert(f);
de190aef 1834 assert(test_object);
cec736d2 1835
de190aef
LP
1836 if (n <= 0)
1837 return 0;
cec736d2 1838
de190aef
LP
1839 /* This bisects the array in object 'first', but first checks
1840 * an extra */
de190aef
LP
1841 r = test_object(f, extra, needle);
1842 if (r < 0)
1843 return r;
a536e261
LP
1844
1845 if (r == TEST_FOUND)
1846 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1847
cbdca852
LP
1848 /* if we are looking with DIRECTION_UP then we need to first
1849 see if in the actual array there is a matching entry, and
1850 return the last one of that. But if there isn't any we need
1851 to return this one. Hence remember this, and return it
1852 below. */
1853 if (r == TEST_LEFT)
1854 step_back = direction == DIRECTION_UP;
de190aef 1855
cbdca852
LP
1856 if (r == TEST_RIGHT) {
1857 if (direction == DIRECTION_DOWN)
1858 goto found;
1859 else
1860 return 0;
a536e261 1861 }
cec736d2 1862
de190aef
LP
1863 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1864
cbdca852
LP
1865 if (r == 0 && step_back)
1866 goto found;
1867
ecf68b1d 1868 if (r > 0 && idx)
de190aef
LP
1869 (*idx) ++;
1870
1871 return r;
cbdca852
LP
1872
1873found:
1874 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1875 if (r < 0)
1876 return r;
1877
1878 if (ret)
1879 *ret = o;
1880
1881 if (offset)
1882 *offset = extra;
1883
1884 if (idx)
1885 *idx = 0;
1886
1887 return 1;
1888}
1889
44a6b1b6 1890_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1891 assert(f);
1892 assert(p > 0);
1893
1894 if (p == needle)
1895 return TEST_FOUND;
1896 else if (p < needle)
1897 return TEST_LEFT;
1898 else
1899 return TEST_RIGHT;
1900}
1901
de190aef
LP
1902static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1903 Object *o;
1904 int r;
1905
1906 assert(f);
1907 assert(p > 0);
1908
1909 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1910 if (r < 0)
1911 return r;
1912
de190aef
LP
1913 if (le64toh(o->entry.seqnum) == needle)
1914 return TEST_FOUND;
1915 else if (le64toh(o->entry.seqnum) < needle)
1916 return TEST_LEFT;
1917 else
1918 return TEST_RIGHT;
1919}
cec736d2 1920
de190aef
LP
1921int journal_file_move_to_entry_by_seqnum(
1922 JournalFile *f,
1923 uint64_t seqnum,
1924 direction_t direction,
1925 Object **ret,
1926 uint64_t *offset) {
1927
1928 return generic_array_bisect(f,
1929 le64toh(f->header->entry_array_offset),
1930 le64toh(f->header->n_entries),
1931 seqnum,
1932 test_object_seqnum,
1933 direction,
1934 ret, offset, NULL);
1935}
cec736d2 1936
de190aef
LP
1937static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1938 Object *o;
1939 int r;
1940
1941 assert(f);
1942 assert(p > 0);
1943
1944 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1945 if (r < 0)
1946 return r;
1947
1948 if (le64toh(o->entry.realtime) == needle)
1949 return TEST_FOUND;
1950 else if (le64toh(o->entry.realtime) < needle)
1951 return TEST_LEFT;
1952 else
1953 return TEST_RIGHT;
cec736d2
LP
1954}
1955
de190aef
LP
1956int journal_file_move_to_entry_by_realtime(
1957 JournalFile *f,
1958 uint64_t realtime,
1959 direction_t direction,
1960 Object **ret,
1961 uint64_t *offset) {
1962
1963 return generic_array_bisect(f,
1964 le64toh(f->header->entry_array_offset),
1965 le64toh(f->header->n_entries),
1966 realtime,
1967 test_object_realtime,
1968 direction,
1969 ret, offset, NULL);
1970}
1971
1972static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1973 Object *o;
1974 int r;
1975
1976 assert(f);
1977 assert(p > 0);
1978
1979 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1980 if (r < 0)
1981 return r;
1982
1983 if (le64toh(o->entry.monotonic) == needle)
1984 return TEST_FOUND;
1985 else if (le64toh(o->entry.monotonic) < needle)
1986 return TEST_LEFT;
1987 else
1988 return TEST_RIGHT;
1989}
1990
2a560338 1991static int find_data_object_by_boot_id(
47838ab3
ZJS
1992 JournalFile *f,
1993 sd_id128_t boot_id,
1994 Object **o,
1995 uint64_t *b) {
2a560338 1996
47838ab3
ZJS
1997 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1998
1999 sd_id128_to_string(boot_id, t + 9);
2000 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2001}
2002
de190aef
LP
2003int journal_file_move_to_entry_by_monotonic(
2004 JournalFile *f,
2005 sd_id128_t boot_id,
2006 uint64_t monotonic,
2007 direction_t direction,
2008 Object **ret,
2009 uint64_t *offset) {
2010
de190aef
LP
2011 Object *o;
2012 int r;
2013
cbdca852 2014 assert(f);
de190aef 2015
47838ab3 2016 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
2017 if (r < 0)
2018 return r;
cbdca852 2019 if (r == 0)
de190aef
LP
2020 return -ENOENT;
2021
2022 return generic_array_bisect_plus_one(f,
2023 le64toh(o->data.entry_offset),
2024 le64toh(o->data.entry_array_offset),
2025 le64toh(o->data.n_entries),
2026 monotonic,
2027 test_object_monotonic,
2028 direction,
2029 ret, offset, NULL);
2030}
2031
1fc605b0 2032void journal_file_reset_location(JournalFile *f) {
6573ef05 2033 f->location_type = LOCATION_HEAD;
1fc605b0 2034 f->current_offset = 0;
6573ef05
MS
2035 f->current_seqnum = 0;
2036 f->current_realtime = 0;
2037 f->current_monotonic = 0;
2038 zero(f->current_boot_id);
2039 f->current_xor_hash = 0;
2040}
2041
950c07d4 2042void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2043 f->location_type = LOCATION_SEEK;
2044 f->current_offset = offset;
2045 f->current_seqnum = le64toh(o->entry.seqnum);
2046 f->current_realtime = le64toh(o->entry.realtime);
2047 f->current_monotonic = le64toh(o->entry.monotonic);
2048 f->current_boot_id = o->entry.boot_id;
2049 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2050}
2051
d8ae66d7
MS
2052int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2053 assert(af);
2054 assert(bf);
2055 assert(af->location_type == LOCATION_SEEK);
2056 assert(bf->location_type == LOCATION_SEEK);
2057
2058 /* If contents and timestamps match, these entries are
2059 * identical, even if the seqnum does not match */
2060 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2061 af->current_monotonic == bf->current_monotonic &&
2062 af->current_realtime == bf->current_realtime &&
2063 af->current_xor_hash == bf->current_xor_hash)
2064 return 0;
2065
2066 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2067
2068 /* If this is from the same seqnum source, compare
2069 * seqnums */
2070 if (af->current_seqnum < bf->current_seqnum)
2071 return -1;
2072 if (af->current_seqnum > bf->current_seqnum)
2073 return 1;
2074
2075 /* Wow! This is weird, different data but the same
2076 * seqnums? Something is borked, but let's make the
2077 * best of it and compare by time. */
2078 }
2079
2080 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2081
2082 /* If the boot id matches, compare monotonic time */
2083 if (af->current_monotonic < bf->current_monotonic)
2084 return -1;
2085 if (af->current_monotonic > bf->current_monotonic)
2086 return 1;
2087 }
2088
2089 /* Otherwise, compare UTC time */
2090 if (af->current_realtime < bf->current_realtime)
2091 return -1;
2092 if (af->current_realtime > bf->current_realtime)
2093 return 1;
2094
2095 /* Finally, compare by contents */
2096 if (af->current_xor_hash < bf->current_xor_hash)
2097 return -1;
2098 if (af->current_xor_hash > bf->current_xor_hash)
2099 return 1;
2100
2101 return 0;
2102}
2103
de190aef
LP
2104int journal_file_next_entry(
2105 JournalFile *f,
f534928a 2106 uint64_t p,
de190aef
LP
2107 direction_t direction,
2108 Object **ret, uint64_t *offset) {
2109
fb099c8d 2110 uint64_t i, n, ofs;
cec736d2
LP
2111 int r;
2112
2113 assert(f);
de190aef
LP
2114
2115 n = le64toh(f->header->n_entries);
2116 if (n <= 0)
2117 return 0;
cec736d2 2118
f534928a 2119 if (p == 0)
de190aef 2120 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2121 else {
de190aef
LP
2122 r = generic_array_bisect(f,
2123 le64toh(f->header->entry_array_offset),
2124 le64toh(f->header->n_entries),
2125 p,
2126 test_object_offset,
2127 DIRECTION_DOWN,
2128 NULL, NULL,
2129 &i);
2130 if (r <= 0)
2131 return r;
2132
2133 if (direction == DIRECTION_DOWN) {
2134 if (i >= n - 1)
2135 return 0;
2136
2137 i++;
2138 } else {
2139 if (i <= 0)
2140 return 0;
2141
2142 i--;
2143 }
cec736d2
LP
2144 }
2145
de190aef 2146 /* And jump to it */
fb099c8d
ZJS
2147 r = generic_array_get(f,
2148 le64toh(f->header->entry_array_offset),
2149 i,
2150 ret, &ofs);
2151 if (r <= 0)
2152 return r;
2153
2154 if (p > 0 &&
2155 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2156 log_debug("%s: entry array corrupted at entry %"PRIu64,
2157 f->path, i);
2158 return -EBADMSG;
2159 }
2160
2161 if (offset)
2162 *offset = ofs;
2163
2164 return 1;
de190aef 2165}
cec736d2 2166
de190aef
LP
2167int journal_file_next_entry_for_data(
2168 JournalFile *f,
2169 Object *o, uint64_t p,
2170 uint64_t data_offset,
2171 direction_t direction,
2172 Object **ret, uint64_t *offset) {
2173
2174 uint64_t n, i;
cec736d2 2175 int r;
de190aef 2176 Object *d;
cec736d2
LP
2177
2178 assert(f);
de190aef 2179 assert(p > 0 || !o);
cec736d2 2180
de190aef 2181 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2182 if (r < 0)
de190aef 2183 return r;
cec736d2 2184
de190aef
LP
2185 n = le64toh(d->data.n_entries);
2186 if (n <= 0)
2187 return n;
cec736d2 2188
de190aef
LP
2189 if (!o)
2190 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2191 else {
2192 if (o->object.type != OBJECT_ENTRY)
2193 return -EINVAL;
cec736d2 2194
de190aef
LP
2195 r = generic_array_bisect_plus_one(f,
2196 le64toh(d->data.entry_offset),
2197 le64toh(d->data.entry_array_offset),
2198 le64toh(d->data.n_entries),
2199 p,
2200 test_object_offset,
2201 DIRECTION_DOWN,
2202 NULL, NULL,
2203 &i);
2204
2205 if (r <= 0)
cec736d2
LP
2206 return r;
2207
de190aef
LP
2208 if (direction == DIRECTION_DOWN) {
2209 if (i >= n - 1)
2210 return 0;
cec736d2 2211
de190aef
LP
2212 i++;
2213 } else {
2214 if (i <= 0)
2215 return 0;
cec736d2 2216
de190aef
LP
2217 i--;
2218 }
cec736d2 2219
de190aef 2220 }
cec736d2 2221
de190aef
LP
2222 return generic_array_get_plus_one(f,
2223 le64toh(d->data.entry_offset),
2224 le64toh(d->data.entry_array_offset),
2225 i,
2226 ret, offset);
2227}
cec736d2 2228
cbdca852
LP
2229int journal_file_move_to_entry_by_offset_for_data(
2230 JournalFile *f,
2231 uint64_t data_offset,
2232 uint64_t p,
2233 direction_t direction,
2234 Object **ret, uint64_t *offset) {
2235
2236 int r;
2237 Object *d;
2238
2239 assert(f);
2240
2241 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2242 if (r < 0)
2243 return r;
2244
2245 return generic_array_bisect_plus_one(f,
2246 le64toh(d->data.entry_offset),
2247 le64toh(d->data.entry_array_offset),
2248 le64toh(d->data.n_entries),
2249 p,
2250 test_object_offset,
2251 direction,
2252 ret, offset, NULL);
2253}
2254
2255int journal_file_move_to_entry_by_monotonic_for_data(
2256 JournalFile *f,
2257 uint64_t data_offset,
2258 sd_id128_t boot_id,
2259 uint64_t monotonic,
2260 direction_t direction,
2261 Object **ret, uint64_t *offset) {
2262
cbdca852
LP
2263 Object *o, *d;
2264 int r;
2265 uint64_t b, z;
2266
2267 assert(f);
2268
2269 /* First, seek by time */
47838ab3 2270 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2271 if (r < 0)
2272 return r;
2273 if (r == 0)
2274 return -ENOENT;
2275
2276 r = generic_array_bisect_plus_one(f,
2277 le64toh(o->data.entry_offset),
2278 le64toh(o->data.entry_array_offset),
2279 le64toh(o->data.n_entries),
2280 monotonic,
2281 test_object_monotonic,
2282 direction,
2283 NULL, &z, NULL);
2284 if (r <= 0)
2285 return r;
2286
2287 /* And now, continue seeking until we find an entry that
2288 * exists in both bisection arrays */
2289
2290 for (;;) {
2291 Object *qo;
2292 uint64_t p, q;
2293
2294 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2295 if (r < 0)
2296 return r;
2297
2298 r = generic_array_bisect_plus_one(f,
2299 le64toh(d->data.entry_offset),
2300 le64toh(d->data.entry_array_offset),
2301 le64toh(d->data.n_entries),
2302 z,
2303 test_object_offset,
2304 direction,
2305 NULL, &p, NULL);
2306 if (r <= 0)
2307 return r;
2308
2309 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2310 if (r < 0)
2311 return r;
2312
2313 r = generic_array_bisect_plus_one(f,
2314 le64toh(o->data.entry_offset),
2315 le64toh(o->data.entry_array_offset),
2316 le64toh(o->data.n_entries),
2317 p,
2318 test_object_offset,
2319 direction,
2320 &qo, &q, NULL);
2321
2322 if (r <= 0)
2323 return r;
2324
2325 if (p == q) {
2326 if (ret)
2327 *ret = qo;
2328 if (offset)
2329 *offset = q;
2330
2331 return 1;
2332 }
2333
2334 z = q;
2335 }
cbdca852
LP
2336}
2337
de190aef
LP
2338int journal_file_move_to_entry_by_seqnum_for_data(
2339 JournalFile *f,
2340 uint64_t data_offset,
2341 uint64_t seqnum,
2342 direction_t direction,
2343 Object **ret, uint64_t *offset) {
cec736d2 2344
de190aef
LP
2345 Object *d;
2346 int r;
cec736d2 2347
91a31dde
LP
2348 assert(f);
2349
de190aef 2350 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2351 if (r < 0)
de190aef 2352 return r;
cec736d2 2353
de190aef
LP
2354 return generic_array_bisect_plus_one(f,
2355 le64toh(d->data.entry_offset),
2356 le64toh(d->data.entry_array_offset),
2357 le64toh(d->data.n_entries),
2358 seqnum,
2359 test_object_seqnum,
2360 direction,
2361 ret, offset, NULL);
2362}
cec736d2 2363
de190aef
LP
2364int journal_file_move_to_entry_by_realtime_for_data(
2365 JournalFile *f,
2366 uint64_t data_offset,
2367 uint64_t realtime,
2368 direction_t direction,
2369 Object **ret, uint64_t *offset) {
2370
2371 Object *d;
2372 int r;
2373
91a31dde
LP
2374 assert(f);
2375
de190aef 2376 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2377 if (r < 0)
de190aef
LP
2378 return r;
2379
2380 return generic_array_bisect_plus_one(f,
2381 le64toh(d->data.entry_offset),
2382 le64toh(d->data.entry_array_offset),
2383 le64toh(d->data.n_entries),
2384 realtime,
2385 test_object_realtime,
2386 direction,
2387 ret, offset, NULL);
cec736d2
LP
2388}
2389
0284adc6 2390void journal_file_dump(JournalFile *f) {
7560fffc 2391 Object *o;
7560fffc 2392 int r;
0284adc6 2393 uint64_t p;
7560fffc
LP
2394
2395 assert(f);
2396
0284adc6 2397 journal_file_print_header(f);
7560fffc 2398
0284adc6
LP
2399 p = le64toh(f->header->header_size);
2400 while (p != 0) {
d05089d8 2401 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
2402 if (r < 0)
2403 goto fail;
7560fffc 2404
0284adc6 2405 switch (o->object.type) {
d98cc1f2 2406
0284adc6
LP
2407 case OBJECT_UNUSED:
2408 printf("Type: OBJECT_UNUSED\n");
2409 break;
d98cc1f2 2410
0284adc6
LP
2411 case OBJECT_DATA:
2412 printf("Type: OBJECT_DATA\n");
2413 break;
7560fffc 2414
3c1668da
LP
2415 case OBJECT_FIELD:
2416 printf("Type: OBJECT_FIELD\n");
2417 break;
2418
0284adc6 2419 case OBJECT_ENTRY:
507f22bd
ZJS
2420 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2421 le64toh(o->entry.seqnum),
2422 le64toh(o->entry.monotonic),
2423 le64toh(o->entry.realtime));
0284adc6 2424 break;
7560fffc 2425
0284adc6
LP
2426 case OBJECT_FIELD_HASH_TABLE:
2427 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2428 break;
7560fffc 2429
0284adc6
LP
2430 case OBJECT_DATA_HASH_TABLE:
2431 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2432 break;
7560fffc 2433
0284adc6
LP
2434 case OBJECT_ENTRY_ARRAY:
2435 printf("Type: OBJECT_ENTRY_ARRAY\n");
2436 break;
7560fffc 2437
0284adc6 2438 case OBJECT_TAG:
507f22bd
ZJS
2439 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2440 le64toh(o->tag.seqnum),
2441 le64toh(o->tag.epoch));
0284adc6 2442 break;
3c1668da
LP
2443
2444 default:
8facc349 2445 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 2446 break;
0284adc6 2447 }
7560fffc 2448
d89c8fdf
ZJS
2449 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2450 printf("Flags: %s\n",
2451 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 2452
0284adc6
LP
2453 if (p == le64toh(f->header->tail_object_offset))
2454 p = 0;
2455 else
2456 p = p + ALIGN64(le64toh(o->object.size));
2457 }
7560fffc 2458
0284adc6
LP
2459 return;
2460fail:
2461 log_error("File corrupt");
7560fffc
LP
2462}
2463
718fe4b1
ZJS
2464static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2465 const char *x;
2466
2467 x = format_timestamp(buf, l, t);
2468 if (x)
2469 return x;
2470 return " --- ";
2471}
2472
0284adc6 2473void journal_file_print_header(JournalFile *f) {
2765b7bb 2474 char a[33], b[33], c[33], d[33];
ed375beb 2475 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2476 struct stat st;
2477 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2478
2479 assert(f);
7560fffc 2480
0284adc6
LP
2481 printf("File Path: %s\n"
2482 "File ID: %s\n"
2483 "Machine ID: %s\n"
2484 "Boot ID: %s\n"
2485 "Sequential Number ID: %s\n"
2486 "State: %s\n"
2487 "Compatible Flags:%s%s\n"
d89c8fdf 2488 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
2489 "Header size: %"PRIu64"\n"
2490 "Arena size: %"PRIu64"\n"
2491 "Data Hash Table Size: %"PRIu64"\n"
2492 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2493 "Rotate Suggested: %s\n"
507f22bd
ZJS
2494 "Head Sequential Number: %"PRIu64"\n"
2495 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2496 "Head Realtime Timestamp: %s\n"
3223f44f 2497 "Tail Realtime Timestamp: %s\n"
ed375beb 2498 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2499 "Objects: %"PRIu64"\n"
2500 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2501 f->path,
2502 sd_id128_to_string(f->header->file_id, a),
2503 sd_id128_to_string(f->header->machine_id, b),
2504 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2505 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2506 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2507 f->header->state == STATE_ONLINE ? "ONLINE" :
2508 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 2509 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
2510 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2511 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2512 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2513 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
2514 le64toh(f->header->header_size),
2515 le64toh(f->header->arena_size),
2516 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2517 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2518 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2519 le64toh(f->header->head_entry_seqnum),
2520 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2521 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2522 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2523 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2524 le64toh(f->header->n_objects),
2525 le64toh(f->header->n_entries));
7560fffc 2526
0284adc6 2527 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2528 printf("Data Objects: %"PRIu64"\n"
0284adc6 2529 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2530 le64toh(f->header->n_data),
0284adc6 2531 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2532
0284adc6 2533 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2534 printf("Field Objects: %"PRIu64"\n"
0284adc6 2535 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2536 le64toh(f->header->n_fields),
0284adc6 2537 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2538
2539 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2540 printf("Tag Objects: %"PRIu64"\n",
2541 le64toh(f->header->n_tags));
3223f44f 2542 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2543 printf("Entry Array Objects: %"PRIu64"\n",
2544 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2545
2546 if (fstat(f->fd, &st) >= 0)
59f448cf 2547 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
7560fffc
LP
2548}
2549
fc68c929
LP
2550static int journal_file_warn_btrfs(JournalFile *f) {
2551 unsigned attrs;
2552 int r;
2553
2554 assert(f);
2555
2556 /* Before we write anything, check if the COW logic is turned
2557 * off on btrfs. Given our write pattern that is quite
2558 * unfriendly to COW file systems this should greatly improve
2559 * performance on COW file systems, such as btrfs, at the
2560 * expense of data integrity features (which shouldn't be too
2561 * bad, given that we do our own checksumming). */
2562
2563 r = btrfs_is_filesystem(f->fd);
2564 if (r < 0)
2565 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2566 if (!r)
2567 return 0;
2568
2569 r = read_attr_fd(f->fd, &attrs);
2570 if (r < 0)
2571 return log_warning_errno(r, "Failed to read file attributes: %m");
2572
2573 if (attrs & FS_NOCOW_FL) {
2574 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2575 return 0;
2576 }
2577
2578 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2579 "This is likely to slow down journal access substantially, please consider turning "
2580 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2581
2582 return 1;
2583}
2584
0284adc6
LP
2585int journal_file_open(
2586 const char *fname,
2587 int flags,
2588 mode_t mode,
2589 bool compress,
baed47c3 2590 bool seal,
0284adc6
LP
2591 JournalMetrics *metrics,
2592 MMapCache *mmap_cache,
2593 JournalFile *template,
2594 JournalFile **ret) {
7560fffc 2595
fa6ac760 2596 bool newly_created = false;
0284adc6 2597 JournalFile *f;
fa6ac760 2598 void *h;
0284adc6 2599 int r;
7560fffc 2600
0284adc6 2601 assert(fname);
0559d3a5 2602 assert(ret);
7560fffc 2603
0284adc6
LP
2604 if ((flags & O_ACCMODE) != O_RDONLY &&
2605 (flags & O_ACCMODE) != O_RDWR)
2606 return -EINVAL;
7560fffc 2607
a0108012
LP
2608 if (!endswith(fname, ".journal") &&
2609 !endswith(fname, ".journal~"))
0284adc6 2610 return -EINVAL;
7560fffc 2611
0284adc6
LP
2612 f = new0(JournalFile, 1);
2613 if (!f)
2614 return -ENOMEM;
7560fffc 2615
0284adc6
LP
2616 f->fd = -1;
2617 f->mode = mode;
7560fffc 2618
0284adc6
LP
2619 f->flags = flags;
2620 f->prot = prot_from_flags(flags);
2621 f->writable = (flags & O_ACCMODE) != O_RDONLY;
92261977 2622#if defined(HAVE_LZ4)
d89c8fdf
ZJS
2623 f->compress_lz4 = compress;
2624#elif defined(HAVE_XZ)
2625 f->compress_xz = compress;
48b61739 2626#endif
49a32d43 2627#ifdef HAVE_GCRYPT
baed47c3 2628 f->seal = seal;
49a32d43 2629#endif
7560fffc 2630
0284adc6
LP
2631 if (mmap_cache)
2632 f->mmap = mmap_cache_ref(mmap_cache);
2633 else {
84168d80 2634 f->mmap = mmap_cache_new();
0284adc6
LP
2635 if (!f->mmap) {
2636 r = -ENOMEM;
2637 goto fail;
2638 }
2639 }
7560fffc 2640
0284adc6
LP
2641 f->path = strdup(fname);
2642 if (!f->path) {
2643 r = -ENOMEM;
2644 goto fail;
2645 }
7560fffc 2646
4743015d 2647 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
2648 if (!f->chain_cache) {
2649 r = -ENOMEM;
2650 goto fail;
2651 }
2652
0284adc6
LP
2653 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2654 if (f->fd < 0) {
2655 r = -errno;
2656 goto fail;
7560fffc 2657 }
7560fffc 2658
2678031a
LP
2659 r = journal_file_fstat(f);
2660 if (r < 0)
0284adc6 2661 goto fail;
7560fffc 2662
0284adc6 2663 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a 2664
fc68c929 2665 (void) journal_file_warn_btrfs(f);
11689d2a 2666
fb0951b0
LP
2667 /* Let's attach the creation time to the journal file,
2668 * so that the vacuuming code knows the age of this
2669 * file even if the file might end up corrupted one
2670 * day... Ideally we'd just use the creation time many
2671 * file systems maintain for each file, but there is
2672 * currently no usable API to query this, hence let's
2673 * emulate this via extended attributes. If extended
2674 * attributes are not supported we'll just skip this,
7517e174 2675 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0 2676
d61b600d 2677 fd_setcrtime(f->fd, 0);
7560fffc 2678
feb12d3e 2679#ifdef HAVE_GCRYPT
0284adc6 2680 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2681 * just don't do sealing */
49a32d43
LP
2682 if (f->seal) {
2683 r = journal_file_fss_load(f);
2684 if (r < 0)
2685 f->seal = false;
2686 }
feb12d3e 2687#endif
7560fffc 2688
0284adc6
LP
2689 r = journal_file_init_header(f, template);
2690 if (r < 0)
2691 goto fail;
7560fffc 2692
2678031a
LP
2693 r = journal_file_fstat(f);
2694 if (r < 0)
0284adc6 2695 goto fail;
fb0951b0
LP
2696
2697 newly_created = true;
0284adc6 2698 }
7560fffc 2699
0284adc6
LP
2700 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2701 r = -EIO;
2702 goto fail;
2703 }
7560fffc 2704
fa6ac760 2705 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
977eaa1e 2706 if (r < 0)
0284adc6 2707 goto fail;
7560fffc 2708
fa6ac760
LP
2709 f->header = h;
2710
0284adc6
LP
2711 if (!newly_created) {
2712 r = journal_file_verify_header(f);
2713 if (r < 0)
2714 goto fail;
2715 }
7560fffc 2716
feb12d3e 2717#ifdef HAVE_GCRYPT
0284adc6 2718 if (!newly_created && f->writable) {
baed47c3 2719 r = journal_file_fss_load(f);
0284adc6
LP
2720 if (r < 0)
2721 goto fail;
2722 }
feb12d3e 2723#endif
cec736d2
LP
2724
2725 if (f->writable) {
4a92baf3
LP
2726 if (metrics) {
2727 journal_default_metrics(metrics, f->fd);
2728 f->metrics = *metrics;
2729 } else if (template)
2730 f->metrics = template->metrics;
2731
cec736d2
LP
2732 r = journal_file_refresh_header(f);
2733 if (r < 0)
2734 goto fail;
2735 }
2736
feb12d3e 2737#ifdef HAVE_GCRYPT
baed47c3 2738 r = journal_file_hmac_setup(f);
14d10188
LP
2739 if (r < 0)
2740 goto fail;
feb12d3e 2741#endif
14d10188 2742
cec736d2 2743 if (newly_created) {
de190aef 2744 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2745 if (r < 0)
2746 goto fail;
2747
de190aef 2748 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2749 if (r < 0)
2750 goto fail;
7560fffc 2751
feb12d3e 2752#ifdef HAVE_GCRYPT
7560fffc
LP
2753 r = journal_file_append_first_tag(f);
2754 if (r < 0)
2755 goto fail;
feb12d3e 2756#endif
cec736d2
LP
2757 }
2758
fa6ac760
LP
2759 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2760 r = -EIO;
2761 goto fail;
2762 }
2763
0559d3a5 2764 *ret = f;
cec736d2
LP
2765 return 0;
2766
2767fail:
fa6ac760
LP
2768 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2769 r = -EIO;
2770
cec736d2
LP
2771 journal_file_close(f);
2772
2773 return r;
2774}
0ac38b70 2775
baed47c3 2776int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2777 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2778 size_t l;
2779 JournalFile *old_file, *new_file = NULL;
2780 int r;
2781
2782 assert(f);
2783 assert(*f);
2784
2785 old_file = *f;
2786
2787 if (!old_file->writable)
2788 return -EINVAL;
2789
2790 if (!endswith(old_file->path, ".journal"))
2791 return -EINVAL;
2792
2793 l = strlen(old_file->path);
57535f47
ZJS
2794 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2795 (int) l - 8, old_file->path,
2796 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2797 le64toh((*f)->header->head_entry_seqnum),
2798 le64toh((*f)->header->head_entry_realtime));
2799 if (r < 0)
0ac38b70
LP
2800 return -ENOMEM;
2801
2678031a
LP
2802 /* Try to rename the file to the archived version. If the file
2803 * already was deleted, we'll get ENOENT, let's ignore that
2804 * case. */
0ac38b70 2805 r = rename(old_file->path, p);
2678031a 2806 if (r < 0 && errno != ENOENT)
0ac38b70
LP
2807 return -errno;
2808
ccdbaf91 2809 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2810
f27a3864
LP
2811 /* Currently, btrfs is not very good with out write patterns
2812 * and fragments heavily. Let's defrag our journal files when
2813 * we archive them */
2814 old_file->defrag_on_close = true;
2815
baed47c3 2816 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2817 journal_file_close(old_file);
2818
2819 *f = new_file;
2820 return r;
2821}
2822
9447a7f1
LP
2823int journal_file_open_reliably(
2824 const char *fname,
2825 int flags,
2826 mode_t mode,
7560fffc 2827 bool compress,
baed47c3 2828 bool seal,
4a92baf3 2829 JournalMetrics *metrics,
27370278 2830 MMapCache *mmap_cache,
9447a7f1
LP
2831 JournalFile *template,
2832 JournalFile **ret) {
2833
2834 int r;
2835 size_t l;
ed375beb 2836 _cleanup_free_ char *p = NULL;
9447a7f1 2837
070052ab 2838 r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
288359db
ZJS
2839 if (!IN_SET(r,
2840 -EBADMSG, /* corrupted */
2841 -ENODATA, /* truncated */
2842 -EHOSTDOWN, /* other machine */
2843 -EPROTONOSUPPORT, /* incompatible feature */
2844 -EBUSY, /* unclean shutdown */
2845 -ESHUTDOWN, /* already archived */
2846 -EIO, /* IO error, including SIGBUS on mmap */
2847 -EIDRM /* File has been deleted */))
9447a7f1
LP
2848 return r;
2849
2850 if ((flags & O_ACCMODE) == O_RDONLY)
2851 return r;
2852
2853 if (!(flags & O_CREAT))
2854 return r;
2855
7560fffc
LP
2856 if (!endswith(fname, ".journal"))
2857 return r;
2858
5c70eab4
LP
2859 /* The file is corrupted. Rotate it away and try it again (but only once) */
2860
9447a7f1 2861 l = strlen(fname);
d587eca5 2862 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
57535f47 2863 (int) l - 8, fname,
d587eca5 2864 now(CLOCK_REALTIME),
9bf3b535 2865 random_u64()) < 0)
9447a7f1
LP
2866 return -ENOMEM;
2867
65089b82 2868 if (rename(fname, p) < 0)
9447a7f1
LP
2869 return -errno;
2870
f27a3864
LP
2871 /* btrfs doesn't cope well with our write pattern and
2872 * fragments heavily. Let's defrag all files we rotate */
11689d2a
LP
2873
2874 (void) chattr_path(p, false, FS_NOCOW_FL);
f27a3864
LP
2875 (void) btrfs_defrag(p);
2876
65089b82 2877 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2878
070052ab 2879 return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
9447a7f1
LP
2880}
2881
cf244689
LP
2882int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2883 uint64_t i, n;
2884 uint64_t q, xor_hash = 0;
2885 int r;
2886 EntryItem *items;
2887 dual_timestamp ts;
2888
2889 assert(from);
2890 assert(to);
2891 assert(o);
2892 assert(p);
2893
2894 if (!to->writable)
2895 return -EPERM;
2896
2897 ts.monotonic = le64toh(o->entry.monotonic);
2898 ts.realtime = le64toh(o->entry.realtime);
2899
cf244689 2900 n = journal_file_entry_n_items(o);
4faa7004
TA
2901 /* alloca() can't take 0, hence let's allocate at least one */
2902 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
2903
2904 for (i = 0; i < n; i++) {
4fd052ae
FC
2905 uint64_t l, h;
2906 le64_t le_hash;
cf244689
LP
2907 size_t t;
2908 void *data;
2909 Object *u;
2910
2911 q = le64toh(o->entry.items[i].object_offset);
2912 le_hash = o->entry.items[i].hash;
2913
2914 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2915 if (r < 0)
2916 return r;
2917
2918 if (le_hash != o->data.hash)
2919 return -EBADMSG;
2920
2921 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2922 t = (size_t) l;
2923
2924 /* We hit the limit on 32bit machines */
2925 if ((uint64_t) t != l)
2926 return -E2BIG;
2927
d89c8fdf 2928 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 2929#if defined(HAVE_XZ) || defined(HAVE_LZ4)
a7f7d1bd 2930 size_t rsize = 0;
cf244689 2931
d89c8fdf
ZJS
2932 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2933 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2934 if (r < 0)
2935 return r;
cf244689
LP
2936
2937 data = from->compress_buffer;
2938 l = rsize;
3b1a55e1
ZJS
2939#else
2940 return -EPROTONOSUPPORT;
2941#endif
cf244689
LP
2942 } else
2943 data = o->data.payload;
2944
2945 r = journal_file_append_data(to, data, l, &u, &h);
2946 if (r < 0)
2947 return r;
2948
2949 xor_hash ^= le64toh(u->data.hash);
2950 items[i].object_offset = htole64(h);
2951 items[i].hash = u->data.hash;
2952
2953 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2954 if (r < 0)
2955 return r;
2956 }
2957
fa6ac760
LP
2958 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2959
2960 if (mmap_cache_got_sigbus(to->mmap, to->fd))
2961 return -EIO;
2962
2963 return r;
cf244689 2964}
babfc091 2965
8580d1f7
LP
2966void journal_reset_metrics(JournalMetrics *m) {
2967 assert(m);
2968
2969 /* Set everything to "pick automatic values". */
2970
2971 *m = (JournalMetrics) {
2972 .min_use = (uint64_t) -1,
2973 .max_use = (uint64_t) -1,
2974 .min_size = (uint64_t) -1,
2975 .max_size = (uint64_t) -1,
2976 .keep_free = (uint64_t) -1,
2977 .n_max_files = (uint64_t) -1,
2978 };
2979}
2980
babfc091 2981void journal_default_metrics(JournalMetrics *m, int fd) {
8580d1f7 2982 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
babfc091 2983 struct statvfs ss;
8580d1f7 2984 uint64_t fs_size;
babfc091
LP
2985
2986 assert(m);
2987 assert(fd >= 0);
2988
2989 if (fstatvfs(fd, &ss) >= 0)
2990 fs_size = ss.f_frsize * ss.f_blocks;
8580d1f7
LP
2991 else {
2992 log_debug_errno(errno, "Failed to detremine disk size: %m");
2993 fs_size = 0;
2994 }
babfc091
LP
2995
2996 if (m->max_use == (uint64_t) -1) {
2997
2998 if (fs_size > 0) {
2999 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3000
3001 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3002 m->max_use = DEFAULT_MAX_USE_UPPER;
3003
3004 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3005 m->max_use = DEFAULT_MAX_USE_LOWER;
3006 } else
3007 m->max_use = DEFAULT_MAX_USE_LOWER;
3008 } else {
3009 m->max_use = PAGE_ALIGN(m->max_use);
3010
8580d1f7 3011 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
babfc091
LP
3012 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3013 }
3014
8580d1f7
LP
3015 if (m->min_use == (uint64_t) -1)
3016 m->min_use = DEFAULT_MIN_USE;
3017
3018 if (m->min_use > m->max_use)
3019 m->min_use = m->max_use;
3020
babfc091
LP
3021 if (m->max_size == (uint64_t) -1) {
3022 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3023
3024 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3025 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3026 } else
3027 m->max_size = PAGE_ALIGN(m->max_size);
3028
8580d1f7
LP
3029 if (m->max_size != 0) {
3030 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3031 m->max_size = JOURNAL_FILE_SIZE_MIN;
babfc091 3032
8580d1f7
LP
3033 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3034 m->max_use = m->max_size*2;
3035 }
babfc091
LP
3036
3037 if (m->min_size == (uint64_t) -1)
3038 m->min_size = JOURNAL_FILE_SIZE_MIN;
3039 else {
3040 m->min_size = PAGE_ALIGN(m->min_size);
3041
3042 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3043 m->min_size = JOURNAL_FILE_SIZE_MIN;
3044
8580d1f7 3045 if (m->max_size != 0 && m->min_size > m->max_size)
babfc091
LP
3046 m->max_size = m->min_size;
3047 }
3048
3049 if (m->keep_free == (uint64_t) -1) {
3050
3051 if (fs_size > 0) {
8621b110 3052 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
3053
3054 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3055 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3056
3057 } else
3058 m->keep_free = DEFAULT_KEEP_FREE;
3059 }
3060
8580d1f7
LP
3061 if (m->n_max_files == (uint64_t) -1)
3062 m->n_max_files = DEFAULT_N_MAX_FILES;
3063
3064 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3065 format_bytes(a, sizeof(a), m->min_use),
3066 format_bytes(b, sizeof(b), m->max_use),
3067 format_bytes(c, sizeof(c), m->max_size),
3068 format_bytes(d, sizeof(d), m->min_size),
3069 format_bytes(e, sizeof(e), m->keep_free),
3070 m->n_max_files);
babfc091 3071}
08984293
LP
3072
3073int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
3074 assert(f);
3075 assert(from || to);
3076
3077 if (from) {
162566a4
LP
3078 if (f->header->head_entry_realtime == 0)
3079 return -ENOENT;
08984293 3080
162566a4 3081 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
3082 }
3083
3084 if (to) {
162566a4
LP
3085 if (f->header->tail_entry_realtime == 0)
3086 return -ENOENT;
08984293 3087
162566a4 3088 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
3089 }
3090
3091 return 1;
3092}
3093
3094int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
3095 Object *o;
3096 uint64_t p;
3097 int r;
3098
3099 assert(f);
3100 assert(from || to);
3101
47838ab3 3102 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
3103 if (r <= 0)
3104 return r;
3105
3106 if (le64toh(o->data.n_entries) <= 0)
3107 return 0;
3108
3109 if (from) {
3110 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3111 if (r < 0)
3112 return r;
3113
3114 *from = le64toh(o->entry.monotonic);
3115 }
3116
3117 if (to) {
3118 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3119 if (r < 0)
3120 return r;
3121
3122 r = generic_array_get_plus_one(f,
3123 le64toh(o->data.entry_offset),
3124 le64toh(o->data.entry_array_offset),
3125 le64toh(o->data.n_entries)-1,
3126 &o, NULL);
3127 if (r <= 0)
3128 return r;
3129
3130 *to = le64toh(o->entry.monotonic);
3131 }
3132
3133 return 1;
3134}
dca6219e 3135
fb0951b0 3136bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
3137 assert(f);
3138
3139 /* If we gained new header fields we gained new features,
3140 * hence suggest a rotation */
361f9cbc
LP
3141 if (le64toh(f->header->header_size) < sizeof(Header)) {
3142 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3143 return true;
361f9cbc 3144 }
dca6219e
LP
3145
3146 /* Let's check if the hash tables grew over a certain fill
3147 * level (75%, borrowing this value from Java's hash table
3148 * implementation), and if so suggest a rotation. To calculate
3149 * the fill level we need the n_data field, which only exists
3150 * in newer versions. */
3151
3152 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3153 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3154 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3155 f->path,
3156 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3157 le64toh(f->header->n_data),
3158 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3159 (unsigned long long) f->last_stat.st_size,
3160 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3161 return true;
361f9cbc 3162 }
dca6219e
LP
3163
3164 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3165 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3166 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3167 f->path,
3168 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3169 le64toh(f->header->n_fields),
3170 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3171 return true;
361f9cbc 3172 }
dca6219e 3173
0598fd4a
LP
3174 /* Are the data objects properly indexed by field objects? */
3175 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3176 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3177 le64toh(f->header->n_data) > 0 &&
3178 le64toh(f->header->n_fields) == 0)
3179 return true;
3180
fb0951b0
LP
3181 if (max_file_usec > 0) {
3182 usec_t t, h;
3183
3184 h = le64toh(f->header->head_entry_realtime);
3185 t = now(CLOCK_REALTIME);
3186
3187 if (h > 0 && t > h + max_file_usec)
3188 return true;
3189 }
3190
dca6219e
LP
3191 return false;
3192}