]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
journal: fix error handling when compressing journal objects
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <sys/mman.h>
23#include <errno.h>
24#include <sys/uio.h>
25#include <unistd.h>
26#include <sys/statvfs.h>
27#include <fcntl.h>
28#include <stddef.h>
11689d2a 29#include <linux/fs.h>
fb0951b0 30
f27a3864 31#include "btrfs-util.h"
cec736d2
LP
32#include "journal-def.h"
33#include "journal-file.h"
0284adc6 34#include "journal-authenticate.h"
cec736d2 35#include "lookup3.h"
807e17f0 36#include "compress.h"
3df3e884 37#include "random-util.h"
cec736d2 38
4a92baf3
LP
/* Default sizes (in bytes) of the data and field hash tables, expressed
 * as a number of HashItem slots. */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL * sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL * sizeof(HashItem))

/* Data payloads smaller than this many bytes are never compressed. */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* Smallest size a journal file may have. */
#define JOURNAL_FILE_SIZE_MIN (4ULL * 1024ULL * 1024ULL)                /* 4 MiB */

/* Lower and upper bounds applied when max_use is deduced from the
 * file system size. */
#define DEFAULT_MAX_USE_LOWER (1ULL * 1024ULL * 1024ULL)                /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL * 1024ULL * 1024ULL * 1024ULL)      /* 4 GiB */

/* Default minimal use limit: how much we will use even if keep_free
 * suggests otherwise. */
#define DEFAULT_MIN_USE (1ULL * 1024ULL * 1024ULL)                      /* 1 MiB */

/* Upper bound applied when max_size is deduced from max_use. */
#define DEFAULT_MAX_SIZE_UPPER (128ULL * 1024ULL * 1024ULL)             /* 128 MiB */

/* Upper bound applied when keep_free is deduced from the file system
 * size. */
#define DEFAULT_KEEP_FREE_UPPER (4ULL * 1024ULL * 1024ULL * 1024ULL)    /* 4 GiB */

/* keep_free value used when the file system size cannot be determined. */
#define DEFAULT_KEEP_FREE (1024ULL * 1024ULL)                           /* 1 MiB */

/* Default maximum number of journal files to keep around. */
#define DEFAULT_N_MAX_FILES (100)

/* n_data was the first header field added after the initial file format
 * design, hence a valid header is at least this large. */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* Maximum number of entries kept in the entry array chain cache. */
#define CHAIN_CACHE_MAX 20

/* Granularity by which the journal file grows whenever new space has to
 * be allocated. */
#define FILE_SIZE_INCREASE (8ULL * 1024ULL * 1024ULL)                   /* 8 MiB */

/* Re-run fstat() on the file at least this often, to detect deletions. */
#define LAST_STAT_REFRESH_USEC (5 * USEC_PER_SEC)

/* The mmap context used for the header: one above the last defined
 * object type. */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX
83
9588bc32 84static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
85 assert(f);
86
87 if (!f->writable)
88 return -EPERM;
89
90 if (!(f->fd >= 0 && f->header))
91 return -EINVAL;
92
fa6ac760
LP
93 if (mmap_cache_got_sigbus(f->mmap, f->fd))
94 return -EIO;
95
26687bf8
OS
96 switch(f->header->state) {
97 case STATE_ONLINE:
98 return 0;
99
100 case STATE_OFFLINE:
101 f->header->state = STATE_ONLINE;
102 fsync(f->fd);
103 return 0;
104
105 default:
106 return -EINVAL;
107 }
108}
109
110int journal_file_set_offline(JournalFile *f) {
111 assert(f);
112
113 if (!f->writable)
114 return -EPERM;
115
116 if (!(f->fd >= 0 && f->header))
117 return -EINVAL;
118
119 if (f->header->state != STATE_ONLINE)
120 return 0;
121
122 fsync(f->fd);
123
fa6ac760
LP
124 if (mmap_cache_got_sigbus(f->mmap, f->fd))
125 return -EIO;
126
26687bf8
OS
127 f->header->state = STATE_OFFLINE;
128
fa6ac760
LP
129 if (mmap_cache_got_sigbus(f->mmap, f->fd))
130 return -EIO;
131
26687bf8
OS
132 fsync(f->fd);
133
134 return 0;
135}
136
804ae586 137JournalFile* journal_file_close(JournalFile *f) {
de190aef 138 assert(f);
cec736d2 139
feb12d3e 140#ifdef HAVE_GCRYPT
b0af6f41 141 /* Write the final tag */
c586dbf1 142 if (f->seal && f->writable)
b0af6f41 143 journal_file_append_tag(f);
feb12d3e 144#endif
b0af6f41 145
26687bf8 146 journal_file_set_offline(f);
cec736d2 147
fa6ac760
LP
148 if (f->mmap && f->fd >= 0)
149 mmap_cache_close_fd(f->mmap, f->fd);
cec736d2 150
11689d2a
LP
151 if (f->fd >= 0 && f->defrag_on_close) {
152
153 /* Be friendly to btrfs: turn COW back on again now,
154 * and defragment the file. We won't write to the file
155 * ever again, hence remove all fragmentation, and
156 * reenable all the good bits COW usually provides
157 * (such as data checksumming). */
158
1ed8f8c1 159 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
11689d2a
LP
160 (void) btrfs_defrag_fd(f->fd);
161 }
f27a3864 162
03e334a1 163 safe_close(f->fd);
cec736d2 164 free(f->path);
807e17f0 165
16e9f408
LP
166 if (f->mmap)
167 mmap_cache_unref(f->mmap);
168
4743015d 169 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 170
d89c8fdf 171#if defined(HAVE_XZ) || defined(HAVE_LZ4)
807e17f0
LP
172 free(f->compress_buffer);
173#endif
174
7560fffc 175#ifdef HAVE_GCRYPT
baed47c3
LP
176 if (f->fss_file)
177 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
dc4ebc07 178 else
b7c9ae91
LP
179 free(f->fsprg_state);
180
181 free(f->fsprg_seed);
7560fffc
LP
182
183 if (f->hmac)
184 gcry_md_close(f->hmac);
185#endif
186
cec736d2 187 free(f);
804ae586 188 return NULL;
cec736d2
LP
189}
190
0ac38b70 191static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 192 Header h = {};
cec736d2
LP
193 ssize_t k;
194 int r;
195
196 assert(f);
197
7560fffc 198 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 199 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 200
d89c8fdf
ZJS
201 h.incompatible_flags |= htole32(
202 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
203 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 204
d89c8fdf
ZJS
205 h.compatible_flags = htole32(
206 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 207
cec736d2
LP
208 r = sd_id128_randomize(&h.file_id);
209 if (r < 0)
210 return r;
211
0ac38b70
LP
212 if (template) {
213 h.seqnum_id = template->header->seqnum_id;
beec0085 214 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
215 } else
216 h.seqnum_id = h.file_id;
cec736d2
LP
217
218 k = pwrite(f->fd, &h, sizeof(h), 0);
219 if (k < 0)
220 return -errno;
221
222 if (k != sizeof(h))
223 return -EIO;
224
225 return 0;
226}
227
228static int journal_file_refresh_header(JournalFile *f) {
de190aef 229 sd_id128_t boot_id;
fa6ac760 230 int r;
cec736d2
LP
231
232 assert(f);
233
234 r = sd_id128_get_machine(&f->header->machine_id);
235 if (r < 0)
236 return r;
237
de190aef 238 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
239 if (r < 0)
240 return r;
241
de190aef
LP
242 if (sd_id128_equal(boot_id, f->header->boot_id))
243 f->tail_entry_monotonic_valid = true;
244
245 f->header->boot_id = boot_id;
246
fa6ac760 247 r = journal_file_set_online(f);
b788cc23 248
7560fffc 249 /* Sync the online state to disk */
a676e665 250 fsync(f->fd);
b788cc23 251
fa6ac760 252 return r;
cec736d2
LP
253}
254
255static int journal_file_verify_header(JournalFile *f) {
d89c8fdf
ZJS
256 uint32_t flags;
257
cec736d2
LP
258 assert(f);
259
7560fffc 260 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
261 return -EBADMSG;
262
7560fffc
LP
263 /* In both read and write mode we refuse to open files with
264 * incompatible flags we don't know */
d89c8fdf
ZJS
265 flags = le32toh(f->header->incompatible_flags);
266 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
267 if (flags & ~HEADER_INCOMPATIBLE_ANY)
268 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
269 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
270 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
271 if (flags)
272 log_debug("Journal file %s uses incompatible flags %"PRIx32
273 " disabled at compilation time.", f->path, flags);
cec736d2 274 return -EPROTONOSUPPORT;
d89c8fdf 275 }
cec736d2 276
7560fffc
LP
277 /* When open for writing we refuse to open files with
278 * compatible flags, too */
d89c8fdf
ZJS
279 flags = le32toh(f->header->compatible_flags);
280 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
281 if (flags & ~HEADER_COMPATIBLE_ANY)
282 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
283 f->path, flags & ~HEADER_COMPATIBLE_ANY);
284 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
285 if (flags)
286 log_debug("Journal file %s uses compatible flags %"PRIx32
287 " disabled at compilation time.", f->path, flags);
288 return -EPROTONOSUPPORT;
7560fffc
LP
289 }
290
db11ac1a
LP
291 if (f->header->state >= _STATE_MAX)
292 return -EBADMSG;
293
dca6219e
LP
294 /* The first addition was n_data, so check that we are at least this large */
295 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
296 return -EBADMSG;
297
8088cbd3 298 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
299 return -EBADMSG;
300
db11ac1a
LP
301 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
302 return -ENODATA;
303
304 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
305 return -ENODATA;
306
7762e02b
LP
307 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
308 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
309 !VALID64(le64toh(f->header->tail_object_offset)) ||
310 !VALID64(le64toh(f->header->entry_array_offset)))
311 return -ENODATA;
312
cec736d2 313 if (f->writable) {
ccdbaf91 314 uint8_t state;
cec736d2
LP
315 sd_id128_t machine_id;
316 int r;
317
318 r = sd_id128_get_machine(&machine_id);
319 if (r < 0)
320 return r;
321
322 if (!sd_id128_equal(machine_id, f->header->machine_id))
323 return -EHOSTDOWN;
324
de190aef 325 state = f->header->state;
cec736d2 326
71fa6f00
LP
327 if (state == STATE_ONLINE) {
328 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
329 return -EBUSY;
330 } else if (state == STATE_ARCHIVED)
cec736d2 331 return -ESHUTDOWN;
71fa6f00 332 else if (state != STATE_OFFLINE) {
8facc349 333 log_debug("Journal file %s has unknown state %i.", f->path, state);
71fa6f00
LP
334 return -EBUSY;
335 }
cec736d2
LP
336 }
337
d89c8fdf
ZJS
338 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
339 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 340
f1889c91 341 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 342
cec736d2
LP
343 return 0;
344}
345
2678031a
LP
346static int journal_file_fstat(JournalFile *f) {
347 assert(f);
348 assert(f->fd >= 0);
349
350 if (fstat(f->fd, &f->last_stat) < 0)
351 return -errno;
352
353 f->last_stat_usec = now(CLOCK_MONOTONIC);
354
355 /* Refuse appending to files that are already deleted */
356 if (f->last_stat.st_nlink <= 0)
357 return -EIDRM;
358
359 return 0;
360}
361
cec736d2 362static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 363 uint64_t old_size, new_size;
fec2aa2f 364 int r;
cec736d2
LP
365
366 assert(f);
367
cec736d2 368 /* We assume that this file is not sparse, and we know that
38ac38b2 369 * for sure, since we always call posix_fallocate()
cec736d2
LP
370 * ourselves */
371
fa6ac760
LP
372 if (mmap_cache_got_sigbus(f->mmap, f->fd))
373 return -EIO;
374
cec736d2 375 old_size =
23b0b2b2 376 le64toh(f->header->header_size) +
cec736d2
LP
377 le64toh(f->header->arena_size);
378
bc85bfee 379 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
380 if (new_size < le64toh(f->header->header_size))
381 new_size = le64toh(f->header->header_size);
bc85bfee 382
2678031a
LP
383 if (new_size <= old_size) {
384
385 /* We already pre-allocated enough space, but before
386 * we write to it, let's check with fstat() if the
387 * file got deleted, in order make sure we don't throw
388 * away the data immediately. Don't check fstat() for
389 * all writes though, but only once ever 10s. */
390
391 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
392 return 0;
393
394 return journal_file_fstat(f);
395 }
396
397 /* Allocate more space. */
cec736d2 398
a676e665 399 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 400 return -E2BIG;
cec736d2 401
a676e665 402 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
403 struct statvfs svfs;
404
405 if (fstatvfs(f->fd, &svfs) >= 0) {
406 uint64_t available;
407
070052ab 408 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
cec736d2
LP
409
410 if (new_size - old_size > available)
411 return -E2BIG;
412 }
413 }
414
eda4b58b
LP
415 /* Increase by larger blocks at once */
416 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
417 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
418 new_size = f->metrics.max_size;
419
bc85bfee
LP
420 /* Note that the glibc fallocate() fallback is very
421 inefficient, hence we try to minimize the allocation area
422 as we can. */
fec2aa2f
GV
423 r = posix_fallocate(f->fd, old_size, new_size - old_size);
424 if (r != 0)
425 return -r;
cec736d2 426
23b0b2b2 427 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 428
2678031a 429 return journal_file_fstat(f);
cec736d2
LP
430}
431
78519831 432static unsigned type_to_context(ObjectType type) {
d3d3208f 433 /* One context for each type, plus one catch-all for the rest */
69adae51 434 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 435 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 436 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
437}
438
7a9dabea 439static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
2678031a
LP
440 int r;
441
cec736d2 442 assert(f);
cec736d2
LP
443 assert(ret);
444
7762e02b
LP
445 if (size <= 0)
446 return -EINVAL;
447
2a59ea54 448 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
449 if (offset + size > (uint64_t) f->last_stat.st_size) {
450 /* Hmm, out of range? Let's refresh the fstat() data
451 * first, before we trust that check. */
452
2678031a
LP
453 r = journal_file_fstat(f);
454 if (r < 0)
455 return r;
456
457 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
458 return -EADDRNOTAVAIL;
459 }
460
7a9dabea 461 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
462}
463
16e9f408
LP
464static uint64_t minimum_header_size(Object *o) {
465
b8e891e6 466 static const uint64_t table[] = {
16e9f408
LP
467 [OBJECT_DATA] = sizeof(DataObject),
468 [OBJECT_FIELD] = sizeof(FieldObject),
469 [OBJECT_ENTRY] = sizeof(EntryObject),
470 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
471 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
472 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
473 [OBJECT_TAG] = sizeof(TagObject),
474 };
475
476 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
477 return sizeof(ObjectHeader);
478
479 return table[o->object.type];
480}
481
78519831 482int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
483 int r;
484 void *t;
485 Object *o;
486 uint64_t s;
487
488 assert(f);
489 assert(ret);
490
db11ac1a
LP
491 /* Objects may only be located at multiple of 64 bit */
492 if (!VALID64(offset))
493 return -EFAULT;
494
7a9dabea 495 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
496 if (r < 0)
497 return r;
498
499 o = (Object*) t;
500 s = le64toh(o->object.size);
501
502 if (s < sizeof(ObjectHeader))
503 return -EBADMSG;
504
16e9f408
LP
505 if (o->object.type <= OBJECT_UNUSED)
506 return -EBADMSG;
507
508 if (s < minimum_header_size(o))
509 return -EBADMSG;
510
d05089d8 511 if (type > OBJECT_UNUSED && o->object.type != type)
cec736d2
LP
512 return -EBADMSG;
513
514 if (s > sizeof(ObjectHeader)) {
7a9dabea 515 r = journal_file_move_to(f, type, false, offset, s, &t);
cec736d2
LP
516 if (r < 0)
517 return r;
518
519 o = (Object*) t;
520 }
521
cec736d2
LP
522 *ret = o;
523 return 0;
524}
525
d98cc1f2 526static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
527 uint64_t r;
528
529 assert(f);
530
beec0085 531 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
532
533 if (seqnum) {
de190aef 534 /* If an external seqnum counter was passed, we update
c2373f84
LP
535 * both the local and the external one, and set it to
536 * the maximum of both */
537
538 if (*seqnum + 1 > r)
539 r = *seqnum + 1;
540
541 *seqnum = r;
542 }
543
beec0085 544 f->header->tail_entry_seqnum = htole64(r);
cec736d2 545
beec0085
LP
546 if (f->header->head_entry_seqnum == 0)
547 f->header->head_entry_seqnum = htole64(r);
de190aef 548
cec736d2
LP
549 return r;
550}
551
78519831 552int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
553 int r;
554 uint64_t p;
555 Object *tail, *o;
556 void *t;
557
558 assert(f);
d05089d8 559 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
560 assert(size >= sizeof(ObjectHeader));
561 assert(offset);
562 assert(ret);
563
26687bf8
OS
564 r = journal_file_set_online(f);
565 if (r < 0)
566 return r;
567
cec736d2 568 p = le64toh(f->header->tail_object_offset);
cec736d2 569 if (p == 0)
23b0b2b2 570 p = le64toh(f->header->header_size);
cec736d2 571 else {
d05089d8 572 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
573 if (r < 0)
574 return r;
575
576 p += ALIGN64(le64toh(tail->object.size));
577 }
578
579 r = journal_file_allocate(f, p, size);
580 if (r < 0)
581 return r;
582
fcde2389 583 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
584 if (r < 0)
585 return r;
586
587 o = (Object*) t;
588
589 zero(o->object);
de190aef 590 o->object.type = type;
cec736d2
LP
591 o->object.size = htole64(size);
592
593 f->header->tail_object_offset = htole64(p);
cec736d2
LP
594 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
595
596 *ret = o;
597 *offset = p;
598
599 return 0;
600}
601
de190aef 602static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
603 uint64_t s, p;
604 Object *o;
605 int r;
606
607 assert(f);
608
070052ab
LP
609 /* We estimate that we need 1 hash table entry per 768 bytes
610 of journal file and we want to make sure we never get
611 beyond 75% fill level. Calculate the hash table size for
612 the maximum file size based on these metrics. */
4a92baf3 613
dfabe643 614 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
615 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
616 s = DEFAULT_DATA_HASH_TABLE_SIZE;
617
507f22bd 618 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 619
de190aef
LP
620 r = journal_file_append_object(f,
621 OBJECT_DATA_HASH_TABLE,
622 offsetof(Object, hash_table.items) + s,
623 &o, &p);
cec736d2
LP
624 if (r < 0)
625 return r;
626
29804cc1 627 memzero(o->hash_table.items, s);
cec736d2 628
de190aef
LP
629 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
630 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
631
632 return 0;
633}
634
de190aef 635static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
636 uint64_t s, p;
637 Object *o;
638 int r;
639
640 assert(f);
641
3c1668da
LP
642 /* We use a fixed size hash table for the fields as this
643 * number should grow very slowly only */
644
de190aef
LP
645 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
646 r = journal_file_append_object(f,
647 OBJECT_FIELD_HASH_TABLE,
648 offsetof(Object, hash_table.items) + s,
649 &o, &p);
cec736d2
LP
650 if (r < 0)
651 return r;
652
29804cc1 653 memzero(o->hash_table.items, s);
cec736d2 654
de190aef
LP
655 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
656 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
657
658 return 0;
659}
660
dade37d4 661int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
662 uint64_t s, p;
663 void *t;
664 int r;
665
666 assert(f);
667
dade37d4
LP
668 if (f->data_hash_table)
669 return 0;
670
de190aef
LP
671 p = le64toh(f->header->data_hash_table_offset);
672 s = le64toh(f->header->data_hash_table_size);
cec736d2 673
de190aef 674 r = journal_file_move_to(f,
16e9f408 675 OBJECT_DATA_HASH_TABLE,
fcde2389 676 true,
de190aef
LP
677 p, s,
678 &t);
cec736d2
LP
679 if (r < 0)
680 return r;
681
de190aef 682 f->data_hash_table = t;
cec736d2
LP
683 return 0;
684}
685
dade37d4 686int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
687 uint64_t s, p;
688 void *t;
689 int r;
690
691 assert(f);
692
dade37d4
LP
693 if (f->field_hash_table)
694 return 0;
695
de190aef
LP
696 p = le64toh(f->header->field_hash_table_offset);
697 s = le64toh(f->header->field_hash_table_size);
cec736d2 698
de190aef 699 r = journal_file_move_to(f,
16e9f408 700 OBJECT_FIELD_HASH_TABLE,
fcde2389 701 true,
de190aef
LP
702 p, s,
703 &t);
cec736d2
LP
704 if (r < 0)
705 return r;
706
de190aef 707 f->field_hash_table = t;
cec736d2
LP
708 return 0;
709}
710
3c1668da
LP
711static int journal_file_link_field(
712 JournalFile *f,
713 Object *o,
714 uint64_t offset,
715 uint64_t hash) {
716
805d1486 717 uint64_t p, h, m;
3c1668da
LP
718 int r;
719
720 assert(f);
721 assert(o);
722 assert(offset > 0);
723
724 if (o->object.type != OBJECT_FIELD)
725 return -EINVAL;
726
805d1486
LP
727 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
728 if (m <= 0)
729 return -EBADMSG;
3c1668da 730
805d1486 731 /* This might alter the window we are looking at */
3c1668da
LP
732 o->field.next_hash_offset = o->field.head_data_offset = 0;
733
805d1486 734 h = hash % m;
3c1668da
LP
735 p = le64toh(f->field_hash_table[h].tail_hash_offset);
736 if (p == 0)
737 f->field_hash_table[h].head_hash_offset = htole64(offset);
738 else {
739 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
740 if (r < 0)
741 return r;
742
743 o->field.next_hash_offset = htole64(offset);
744 }
745
746 f->field_hash_table[h].tail_hash_offset = htole64(offset);
747
748 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
749 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
750
751 return 0;
752}
753
754static int journal_file_link_data(
755 JournalFile *f,
756 Object *o,
757 uint64_t offset,
758 uint64_t hash) {
759
805d1486 760 uint64_t p, h, m;
cec736d2
LP
761 int r;
762
763 assert(f);
764 assert(o);
765 assert(offset > 0);
b588975f
LP
766
767 if (o->object.type != OBJECT_DATA)
768 return -EINVAL;
cec736d2 769
805d1486
LP
770 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
771 if (m <= 0)
772 return -EBADMSG;
48496df6 773
805d1486 774 /* This might alter the window we are looking at */
de190aef
LP
775 o->data.next_hash_offset = o->data.next_field_offset = 0;
776 o->data.entry_offset = o->data.entry_array_offset = 0;
777 o->data.n_entries = 0;
cec736d2 778
805d1486 779 h = hash % m;
8db4213e 780 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 781 if (p == 0)
cec736d2 782 /* Only entry in the hash table is easy */
de190aef 783 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 784 else {
48496df6
LP
785 /* Move back to the previous data object, to patch in
786 * pointer */
cec736d2 787
de190aef 788 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
789 if (r < 0)
790 return r;
791
de190aef 792 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
793 }
794
de190aef 795 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 796
dca6219e
LP
797 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
798 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
799
cec736d2
LP
800 return 0;
801}
802
3c1668da
LP
803int journal_file_find_field_object_with_hash(
804 JournalFile *f,
805 const void *field, uint64_t size, uint64_t hash,
806 Object **ret, uint64_t *offset) {
807
805d1486 808 uint64_t p, osize, h, m;
3c1668da
LP
809 int r;
810
811 assert(f);
812 assert(field && size > 0);
813
dade37d4
LP
814 /* If the field hash table is empty, we can't find anything */
815 if (le64toh(f->header->field_hash_table_size) <= 0)
816 return 0;
817
818 /* Map the field hash table, if it isn't mapped yet. */
819 r = journal_file_map_field_hash_table(f);
820 if (r < 0)
821 return r;
822
3c1668da
LP
823 osize = offsetof(Object, field.payload) + size;
824
805d1486 825 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
805d1486 826 if (m <= 0)
3c1668da
LP
827 return -EBADMSG;
828
805d1486 829 h = hash % m;
3c1668da
LP
830 p = le64toh(f->field_hash_table[h].head_hash_offset);
831
832 while (p > 0) {
833 Object *o;
834
835 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
836 if (r < 0)
837 return r;
838
839 if (le64toh(o->field.hash) == hash &&
840 le64toh(o->object.size) == osize &&
841 memcmp(o->field.payload, field, size) == 0) {
842
843 if (ret)
844 *ret = o;
845 if (offset)
846 *offset = p;
847
848 return 1;
849 }
850
851 p = le64toh(o->field.next_hash_offset);
852 }
853
854 return 0;
855}
856
857int journal_file_find_field_object(
858 JournalFile *f,
859 const void *field, uint64_t size,
860 Object **ret, uint64_t *offset) {
861
862 uint64_t hash;
863
864 assert(f);
865 assert(field && size > 0);
866
867 hash = hash64(field, size);
868
869 return journal_file_find_field_object_with_hash(f,
870 field, size, hash,
871 ret, offset);
872}
873
de190aef
LP
874int journal_file_find_data_object_with_hash(
875 JournalFile *f,
876 const void *data, uint64_t size, uint64_t hash,
877 Object **ret, uint64_t *offset) {
48496df6 878
805d1486 879 uint64_t p, osize, h, m;
cec736d2
LP
880 int r;
881
882 assert(f);
883 assert(data || size == 0);
884
dade37d4
LP
885 /* If there's no data hash table, then there's no entry. */
886 if (le64toh(f->header->data_hash_table_size) <= 0)
887 return 0;
888
889 /* Map the data hash table, if it isn't mapped yet. */
890 r = journal_file_map_data_hash_table(f);
891 if (r < 0)
892 return r;
893
cec736d2
LP
894 osize = offsetof(Object, data.payload) + size;
895
805d1486
LP
896 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
897 if (m <= 0)
bc85bfee
LP
898 return -EBADMSG;
899
805d1486 900 h = hash % m;
de190aef 901 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 902
de190aef
LP
903 while (p > 0) {
904 Object *o;
cec736d2 905
de190aef 906 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
907 if (r < 0)
908 return r;
909
807e17f0 910 if (le64toh(o->data.hash) != hash)
85a131e8 911 goto next;
807e17f0 912
d89c8fdf 913 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 914#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51 915 uint64_t l;
a7f7d1bd 916 size_t rsize = 0;
cec736d2 917
807e17f0
LP
918 l = le64toh(o->object.size);
919 if (l <= offsetof(Object, data.payload))
cec736d2
LP
920 return -EBADMSG;
921
807e17f0
LP
922 l -= offsetof(Object, data.payload);
923
d89c8fdf
ZJS
924 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
925 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
926 if (r < 0)
927 return r;
807e17f0 928
b785c858 929 if (rsize == size &&
807e17f0
LP
930 memcmp(f->compress_buffer, data, size) == 0) {
931
932 if (ret)
933 *ret = o;
934
935 if (offset)
936 *offset = p;
937
938 return 1;
939 }
3b1a55e1
ZJS
940#else
941 return -EPROTONOSUPPORT;
942#endif
807e17f0
LP
943 } else if (le64toh(o->object.size) == osize &&
944 memcmp(o->data.payload, data, size) == 0) {
945
cec736d2
LP
946 if (ret)
947 *ret = o;
948
949 if (offset)
950 *offset = p;
951
de190aef 952 return 1;
cec736d2
LP
953 }
954
85a131e8 955 next:
cec736d2
LP
956 p = le64toh(o->data.next_hash_offset);
957 }
958
de190aef
LP
959 return 0;
960}
961
962int journal_file_find_data_object(
963 JournalFile *f,
964 const void *data, uint64_t size,
965 Object **ret, uint64_t *offset) {
966
967 uint64_t hash;
968
969 assert(f);
970 assert(data || size == 0);
971
972 hash = hash64(data, size);
973
974 return journal_file_find_data_object_with_hash(f,
975 data, size, hash,
976 ret, offset);
977}
978
3c1668da
LP
979static int journal_file_append_field(
980 JournalFile *f,
981 const void *field, uint64_t size,
982 Object **ret, uint64_t *offset) {
983
984 uint64_t hash, p;
985 uint64_t osize;
986 Object *o;
987 int r;
988
989 assert(f);
990 assert(field && size > 0);
991
992 hash = hash64(field, size);
993
994 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
995 if (r < 0)
996 return r;
997 else if (r > 0) {
998
999 if (ret)
1000 *ret = o;
1001
1002 if (offset)
1003 *offset = p;
1004
1005 return 0;
1006 }
1007
1008 osize = offsetof(Object, field.payload) + size;
1009 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
1010 if (r < 0)
1011 return r;
3c1668da
LP
1012
1013 o->field.hash = htole64(hash);
1014 memcpy(o->field.payload, field, size);
1015
1016 r = journal_file_link_field(f, o, p, hash);
1017 if (r < 0)
1018 return r;
1019
1020 /* The linking might have altered the window, so let's
1021 * refresh our pointer */
1022 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1023 if (r < 0)
1024 return r;
1025
1026#ifdef HAVE_GCRYPT
1027 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1028 if (r < 0)
1029 return r;
1030#endif
1031
1032 if (ret)
1033 *ret = o;
1034
1035 if (offset)
1036 *offset = p;
1037
1038 return 0;
1039}
1040
48496df6
LP
1041static int journal_file_append_data(
1042 JournalFile *f,
1043 const void *data, uint64_t size,
1044 Object **ret, uint64_t *offset) {
1045
de190aef
LP
1046 uint64_t hash, p;
1047 uint64_t osize;
1048 Object *o;
d89c8fdf 1049 int r, compression = 0;
3c1668da 1050 const void *eq;
de190aef
LP
1051
1052 assert(f);
1053 assert(data || size == 0);
1054
1055 hash = hash64(data, size);
1056
1057 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1058 if (r < 0)
1059 return r;
1060 else if (r > 0) {
1061
1062 if (ret)
1063 *ret = o;
1064
1065 if (offset)
1066 *offset = p;
1067
1068 return 0;
1069 }
1070
1071 osize = offsetof(Object, data.payload) + size;
1072 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1073 if (r < 0)
1074 return r;
1075
cec736d2 1076 o->data.hash = htole64(hash);
807e17f0 1077
d89c8fdf 1078#if defined(HAVE_XZ) || defined(HAVE_LZ4)
d1afbcd2 1079 if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
a7f7d1bd 1080 size_t rsize = 0;
807e17f0 1081
d89c8fdf 1082 compression = compress_blob(data, size, o->data.payload, &rsize);
807e17f0 1083
d1afbcd2 1084 if (compression >= 0) {
807e17f0 1085 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1086 o->object.flags |= compression;
807e17f0 1087
fa1c4b51 1088 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1089 size, rsize, object_compressed_to_string(compression));
d1afbcd2
LP
1090 } else
1091 /* Compression didn't work, we don't really care why, let's continue without compression */
1092 compression = 0;
807e17f0
LP
1093 }
1094#endif
1095
d1afbcd2 1096 if (compression == 0 && size > 0)
807e17f0 1097 memcpy(o->data.payload, data, size);
cec736d2 1098
de190aef 1099 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1100 if (r < 0)
1101 return r;
1102
48496df6
LP
1103 /* The linking might have altered the window, so let's
1104 * refresh our pointer */
1105 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1106 if (r < 0)
1107 return r;
1108
08c6f819
SL
1109 if (!data)
1110 eq = NULL;
1111 else
1112 eq = memchr(data, '=', size);
3c1668da 1113 if (eq && eq > data) {
748db592 1114 Object *fo = NULL;
3c1668da 1115 uint64_t fp;
3c1668da
LP
1116
1117 /* Create field object ... */
1118 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1119 if (r < 0)
1120 return r;
1121
1122 /* ... and link it in. */
1123 o->data.next_field_offset = fo->field.head_data_offset;
1124 fo->field.head_data_offset = le64toh(p);
1125 }
1126
5996c7c2
LP
1127#ifdef HAVE_GCRYPT
1128 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1129 if (r < 0)
1130 return r;
1131#endif
1132
cec736d2
LP
1133 if (ret)
1134 *ret = o;
1135
1136 if (offset)
de190aef 1137 *offset = p;
cec736d2
LP
1138
1139 return 0;
1140}
1141
1142uint64_t journal_file_entry_n_items(Object *o) {
1143 assert(o);
b588975f
LP
1144
1145 if (o->object.type != OBJECT_ENTRY)
1146 return 0;
cec736d2
LP
1147
1148 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1149}
1150
0284adc6 1151uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1152 assert(o);
b588975f
LP
1153
1154 if (o->object.type != OBJECT_ENTRY_ARRAY)
1155 return 0;
de190aef
LP
1156
1157 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1158}
1159
fb9a24b6
LP
1160uint64_t journal_file_hash_table_n_items(Object *o) {
1161 assert(o);
b588975f
LP
1162
1163 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1164 o->object.type != OBJECT_FIELD_HASH_TABLE)
1165 return 0;
fb9a24b6
LP
1166
1167 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1168}
1169
de190aef 1170static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1171 le64_t *first,
1172 le64_t *idx,
de190aef 1173 uint64_t p) {
cec736d2 1174 int r;
de190aef
LP
1175 uint64_t n = 0, ap = 0, q, i, a, hidx;
1176 Object *o;
1177
cec736d2 1178 assert(f);
de190aef
LP
1179 assert(first);
1180 assert(idx);
1181 assert(p > 0);
cec736d2 1182
de190aef
LP
1183 a = le64toh(*first);
1184 i = hidx = le64toh(*idx);
1185 while (a > 0) {
1186
1187 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1188 if (r < 0)
1189 return r;
cec736d2 1190
de190aef
LP
1191 n = journal_file_entry_array_n_items(o);
1192 if (i < n) {
1193 o->entry_array.items[i] = htole64(p);
1194 *idx = htole64(hidx + 1);
1195 return 0;
1196 }
cec736d2 1197
de190aef
LP
1198 i -= n;
1199 ap = a;
1200 a = le64toh(o->entry_array.next_entry_array_offset);
1201 }
1202
1203 if (hidx > n)
1204 n = (hidx+1) * 2;
1205 else
1206 n = n * 2;
1207
1208 if (n < 4)
1209 n = 4;
1210
1211 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1212 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1213 &o, &q);
cec736d2
LP
1214 if (r < 0)
1215 return r;
1216
feb12d3e 1217#ifdef HAVE_GCRYPT
5996c7c2 1218 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1219 if (r < 0)
1220 return r;
feb12d3e 1221#endif
b0af6f41 1222
de190aef 1223 o->entry_array.items[i] = htole64(p);
cec736d2 1224
de190aef 1225 if (ap == 0)
7be3aa17 1226 *first = htole64(q);
cec736d2 1227 else {
de190aef 1228 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1229 if (r < 0)
1230 return r;
1231
de190aef
LP
1232 o->entry_array.next_entry_array_offset = htole64(q);
1233 }
cec736d2 1234
2dee23eb
LP
1235 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1236 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1237
de190aef
LP
1238 *idx = htole64(hidx + 1);
1239
1240 return 0;
1241}
cec736d2 1242
de190aef 1243static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1244 le64_t *extra,
1245 le64_t *first,
1246 le64_t *idx,
de190aef
LP
1247 uint64_t p) {
1248
1249 int r;
1250
1251 assert(f);
1252 assert(extra);
1253 assert(first);
1254 assert(idx);
1255 assert(p > 0);
1256
1257 if (*idx == 0)
1258 *extra = htole64(p);
1259 else {
4fd052ae 1260 le64_t i;
de190aef 1261
7be3aa17 1262 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1263 r = link_entry_into_array(f, first, &i, p);
1264 if (r < 0)
1265 return r;
cec736d2
LP
1266 }
1267
de190aef
LP
1268 *idx = htole64(le64toh(*idx) + 1);
1269 return 0;
1270}
1271
1272static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1273 uint64_t p;
1274 int r;
1275 assert(f);
1276 assert(o);
1277 assert(offset > 0);
1278
1279 p = le64toh(o->entry.items[i].object_offset);
1280 if (p == 0)
1281 return -EINVAL;
1282
1283 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1284 if (r < 0)
1285 return r;
1286
de190aef
LP
1287 return link_entry_into_array_plus_one(f,
1288 &o->data.entry_offset,
1289 &o->data.entry_array_offset,
1290 &o->data.n_entries,
1291 offset);
cec736d2
LP
1292}
1293
1294static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1295 uint64_t n, i;
cec736d2
LP
1296 int r;
1297
1298 assert(f);
1299 assert(o);
1300 assert(offset > 0);
b588975f
LP
1301
1302 if (o->object.type != OBJECT_ENTRY)
1303 return -EINVAL;
cec736d2 1304
b788cc23
LP
1305 __sync_synchronize();
1306
cec736d2 1307 /* Link up the entry itself */
de190aef
LP
1308 r = link_entry_into_array(f,
1309 &f->header->entry_array_offset,
1310 &f->header->n_entries,
1311 offset);
1312 if (r < 0)
1313 return r;
cec736d2 1314
507f22bd 1315 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1316
de190aef 1317 if (f->header->head_entry_realtime == 0)
0ac38b70 1318 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1319
0ac38b70 1320 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1321 f->header->tail_entry_monotonic = o->entry.monotonic;
1322
1323 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1324
1325 /* Link up the items */
1326 n = journal_file_entry_n_items(o);
1327 for (i = 0; i < n; i++) {
1328 r = journal_file_link_entry_item(f, o, offset, i);
1329 if (r < 0)
1330 return r;
1331 }
1332
cec736d2
LP
1333 return 0;
1334}
1335
1336static int journal_file_append_entry_internal(
1337 JournalFile *f,
1338 const dual_timestamp *ts,
1339 uint64_t xor_hash,
1340 const EntryItem items[], unsigned n_items,
de190aef 1341 uint64_t *seqnum,
cec736d2
LP
1342 Object **ret, uint64_t *offset) {
1343 uint64_t np;
1344 uint64_t osize;
1345 Object *o;
1346 int r;
1347
1348 assert(f);
1349 assert(items || n_items == 0);
de190aef 1350 assert(ts);
cec736d2
LP
1351
1352 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1353
de190aef 1354 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1355 if (r < 0)
1356 return r;
1357
d98cc1f2 1358 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1359 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1360 o->entry.realtime = htole64(ts->realtime);
1361 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1362 o->entry.xor_hash = htole64(xor_hash);
1363 o->entry.boot_id = f->header->boot_id;
1364
feb12d3e 1365#ifdef HAVE_GCRYPT
5996c7c2 1366 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1367 if (r < 0)
1368 return r;
feb12d3e 1369#endif
b0af6f41 1370
cec736d2
LP
1371 r = journal_file_link_entry(f, o, np);
1372 if (r < 0)
1373 return r;
1374
1375 if (ret)
1376 *ret = o;
1377
1378 if (offset)
1379 *offset = np;
1380
1381 return 0;
1382}
1383
cf244689 1384void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1385 assert(f);
1386
1387 /* inotify() does not receive IN_MODIFY events from file
1388 * accesses done via mmap(). After each access we hence
1389 * trigger IN_MODIFY by truncating the journal file to its
1390 * current size which triggers IN_MODIFY. */
1391
bc85bfee
LP
1392 __sync_synchronize();
1393
50f20cfd 1394 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
56f64d95 1395 log_error_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1396}
1397
1f2da9ec
LP
1398static int entry_item_cmp(const void *_a, const void *_b) {
1399 const EntryItem *a = _a, *b = _b;
1400
1401 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1402 return -1;
1403 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1404 return 1;
1405 return 0;
1406}
1407
de190aef 1408int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1409 unsigned i;
1410 EntryItem *items;
1411 int r;
1412 uint64_t xor_hash = 0;
de190aef 1413 struct dual_timestamp _ts;
cec736d2
LP
1414
1415 assert(f);
1416 assert(iovec || n_iovec == 0);
1417
de190aef
LP
1418 if (!ts) {
1419 dual_timestamp_get(&_ts);
1420 ts = &_ts;
1421 }
1422
1423 if (f->tail_entry_monotonic_valid &&
1424 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1425 return -EINVAL;
1426
feb12d3e 1427#ifdef HAVE_GCRYPT
7560fffc
LP
1428 r = journal_file_maybe_append_tag(f, ts->realtime);
1429 if (r < 0)
1430 return r;
feb12d3e 1431#endif
7560fffc 1432
64825d3c 1433 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1434 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1435
1436 for (i = 0; i < n_iovec; i++) {
1437 uint64_t p;
1438 Object *o;
1439
1440 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1441 if (r < 0)
cf244689 1442 return r;
cec736d2
LP
1443
1444 xor_hash ^= le64toh(o->data.hash);
1445 items[i].object_offset = htole64(p);
de7b95cd 1446 items[i].hash = o->data.hash;
cec736d2
LP
1447 }
1448
1f2da9ec
LP
1449 /* Order by the position on disk, in order to improve seek
1450 * times for rotating media. */
7ff7394d 1451 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1452
de190aef 1453 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1454
fa6ac760
LP
1455 /* If the memory mapping triggered a SIGBUS then we return an
1456 * IO error and ignore the error code passed down to us, since
1457 * it is very likely just an effect of a nullified replacement
1458 * mapping page */
1459
1460 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1461 r = -EIO;
1462
50f20cfd
LP
1463 journal_file_post_change(f);
1464
cec736d2
LP
1465 return r;
1466}
1467
/* Remembers how far into an entry array chain an earlier lookup got, so that
 * later lookups on the same chain can skip ahead. */
typedef struct ChainCacheItem {
        uint64_t first;      /* offset of the array heading the chain; used as the hashmap key */
        uint64_t array;      /* offset of the array that was cached */
        uint64_t begin;      /* first item stored in the cached array */
        uint64_t total;      /* number of items in all arrays of the chain preceding the cached one */
        uint64_t last_index; /* index last looked at in the cached array, to keep bisection local */
} ChainCacheItem;
1475
1476static void chain_cache_put(
4743015d 1477 OrderedHashmap *h,
a4bcff5b
LP
1478 ChainCacheItem *ci,
1479 uint64_t first,
1480 uint64_t array,
1481 uint64_t begin,
f268980d
LP
1482 uint64_t total,
1483 uint64_t last_index) {
a4bcff5b
LP
1484
1485 if (!ci) {
34741aa3
LP
1486 /* If the chain item to cache for this chain is the
1487 * first one it's not worth caching anything */
1488 if (array == first)
1489 return;
1490
29433089 1491 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 1492 ci = ordered_hashmap_steal_first(h);
29433089
LP
1493 assert(ci);
1494 } else {
a4bcff5b
LP
1495 ci = new(ChainCacheItem, 1);
1496 if (!ci)
1497 return;
1498 }
1499
1500 ci->first = first;
1501
4743015d 1502 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
1503 free(ci);
1504 return;
1505 }
1506 } else
1507 assert(ci->first == first);
1508
1509 ci->array = array;
1510 ci->begin = begin;
1511 ci->total = total;
f268980d 1512 ci->last_index = last_index;
a4bcff5b
LP
1513}
1514
f268980d
LP
1515static int generic_array_get(
1516 JournalFile *f,
1517 uint64_t first,
1518 uint64_t i,
1519 Object **ret, uint64_t *offset) {
de190aef 1520
cec736d2 1521 Object *o;
a4bcff5b 1522 uint64_t p = 0, a, t = 0;
cec736d2 1523 int r;
a4bcff5b 1524 ChainCacheItem *ci;
cec736d2
LP
1525
1526 assert(f);
1527
de190aef 1528 a = first;
a4bcff5b
LP
1529
1530 /* Try the chain cache first */
4743015d 1531 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1532 if (ci && i > ci->total) {
1533 a = ci->array;
1534 i -= ci->total;
1535 t = ci->total;
1536 }
1537
de190aef 1538 while (a > 0) {
a4bcff5b 1539 uint64_t k;
cec736d2 1540
de190aef
LP
1541 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1542 if (r < 0)
1543 return r;
cec736d2 1544
a4bcff5b
LP
1545 k = journal_file_entry_array_n_items(o);
1546 if (i < k) {
de190aef 1547 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1548 goto found;
cec736d2
LP
1549 }
1550
a4bcff5b
LP
1551 i -= k;
1552 t += k;
de190aef
LP
1553 a = le64toh(o->entry_array.next_entry_array_offset);
1554 }
1555
a4bcff5b
LP
1556 return 0;
1557
1558found:
1559 /* Let's cache this item for the next invocation */
af13a6b0 1560 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1561
1562 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1563 if (r < 0)
1564 return r;
1565
1566 if (ret)
1567 *ret = o;
1568
1569 if (offset)
1570 *offset = p;
1571
1572 return 1;
1573}
1574
f268980d
LP
1575static int generic_array_get_plus_one(
1576 JournalFile *f,
1577 uint64_t extra,
1578 uint64_t first,
1579 uint64_t i,
1580 Object **ret, uint64_t *offset) {
de190aef
LP
1581
1582 Object *o;
1583
1584 assert(f);
1585
1586 if (i == 0) {
1587 int r;
1588
1589 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1590 if (r < 0)
1591 return r;
1592
de190aef
LP
1593 if (ret)
1594 *ret = o;
cec736d2 1595
de190aef
LP
1596 if (offset)
1597 *offset = extra;
cec736d2 1598
de190aef 1599 return 1;
cec736d2
LP
1600 }
1601
de190aef
LP
1602 return generic_array_get(f, first, i-1, ret, offset);
1603}
cec736d2 1604
de190aef
LP
/* Results of the test_object callbacks used when bisecting */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object sorts before the needle */
        TEST_RIGHT = 2  /* the tested object sorts after the needle */
};
cec736d2 1610
f268980d
LP
1611static int generic_array_bisect(
1612 JournalFile *f,
1613 uint64_t first,
1614 uint64_t n,
1615 uint64_t needle,
1616 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1617 direction_t direction,
1618 Object **ret,
1619 uint64_t *offset,
1620 uint64_t *idx) {
1621
1622 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1623 bool subtract_one = false;
1624 Object *o, *array = NULL;
1625 int r;
a4bcff5b 1626 ChainCacheItem *ci;
cec736d2 1627
de190aef
LP
1628 assert(f);
1629 assert(test_object);
cec736d2 1630
a4bcff5b 1631 /* Start with the first array in the chain */
de190aef 1632 a = first;
a4bcff5b 1633
4743015d 1634 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1635 if (ci && n > ci->total) {
1636 /* Ah, we have iterated this bisection array chain
1637 * previously! Let's see if we can skip ahead in the
1638 * chain, as far as the last time. But we can't jump
1639 * backwards in the chain, so let's check that
1640 * first. */
1641
1642 r = test_object(f, ci->begin, needle);
1643 if (r < 0)
1644 return r;
1645
1646 if (r == TEST_LEFT) {
f268980d 1647 /* OK, what we are looking for is right of the
a4bcff5b
LP
1648 * begin of this EntryArray, so let's jump
1649 * straight to previously cached array in the
1650 * chain */
1651
1652 a = ci->array;
1653 n -= ci->total;
1654 t = ci->total;
f268980d 1655 last_index = ci->last_index;
a4bcff5b
LP
1656 }
1657 }
1658
de190aef
LP
1659 while (a > 0) {
1660 uint64_t left, right, k, lp;
1661
1662 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1663 if (r < 0)
1664 return r;
1665
de190aef
LP
1666 k = journal_file_entry_array_n_items(array);
1667 right = MIN(k, n);
1668 if (right <= 0)
1669 return 0;
cec736d2 1670
de190aef
LP
1671 i = right - 1;
1672 lp = p = le64toh(array->entry_array.items[i]);
1673 if (p <= 0)
1674 return -EBADMSG;
cec736d2 1675
de190aef
LP
1676 r = test_object(f, p, needle);
1677 if (r < 0)
1678 return r;
cec736d2 1679
de190aef
LP
1680 if (r == TEST_FOUND)
1681 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1682
1683 if (r == TEST_RIGHT) {
1684 left = 0;
1685 right -= 1;
f268980d
LP
1686
1687 if (last_index != (uint64_t) -1) {
1688 assert(last_index <= right);
1689
1690 /* If we cached the last index we
1691 * looked at, let's try to not to jump
1692 * too wildly around and see if we can
1693 * limit the range to look at early to
1694 * the immediate neighbors of the last
1695 * index we looked at. */
1696
1697 if (last_index > 0) {
1698 uint64_t x = last_index - 1;
1699
1700 p = le64toh(array->entry_array.items[x]);
1701 if (p <= 0)
1702 return -EBADMSG;
1703
1704 r = test_object(f, p, needle);
1705 if (r < 0)
1706 return r;
1707
1708 if (r == TEST_FOUND)
1709 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1710
1711 if (r == TEST_RIGHT)
1712 right = x;
1713 else
1714 left = x + 1;
1715 }
1716
1717 if (last_index < right) {
1718 uint64_t y = last_index + 1;
1719
1720 p = le64toh(array->entry_array.items[y]);
1721 if (p <= 0)
1722 return -EBADMSG;
1723
1724 r = test_object(f, p, needle);
1725 if (r < 0)
1726 return r;
1727
1728 if (r == TEST_FOUND)
1729 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1730
1731 if (r == TEST_RIGHT)
1732 right = y;
1733 else
1734 left = y + 1;
1735 }
f268980d
LP
1736 }
1737
de190aef
LP
1738 for (;;) {
1739 if (left == right) {
1740 if (direction == DIRECTION_UP)
1741 subtract_one = true;
1742
1743 i = left;
1744 goto found;
1745 }
1746
1747 assert(left < right);
de190aef 1748 i = (left + right) / 2;
f268980d 1749
de190aef
LP
1750 p = le64toh(array->entry_array.items[i]);
1751 if (p <= 0)
1752 return -EBADMSG;
1753
1754 r = test_object(f, p, needle);
1755 if (r < 0)
1756 return r;
cec736d2 1757
de190aef
LP
1758 if (r == TEST_FOUND)
1759 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1760
1761 if (r == TEST_RIGHT)
1762 right = i;
1763 else
1764 left = i + 1;
1765 }
1766 }
1767
2173cbf8 1768 if (k >= n) {
cbdca852
LP
1769 if (direction == DIRECTION_UP) {
1770 i = n;
1771 subtract_one = true;
1772 goto found;
1773 }
1774
cec736d2 1775 return 0;
cbdca852 1776 }
cec736d2 1777
de190aef
LP
1778 last_p = lp;
1779
1780 n -= k;
1781 t += k;
f268980d 1782 last_index = (uint64_t) -1;
de190aef 1783 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1784 }
1785
1786 return 0;
de190aef
LP
1787
1788found:
1789 if (subtract_one && t == 0 && i == 0)
1790 return 0;
1791
a4bcff5b 1792 /* Let's cache this item for the next invocation */
af13a6b0 1793 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 1794
de190aef
LP
1795 if (subtract_one && i == 0)
1796 p = last_p;
1797 else if (subtract_one)
1798 p = le64toh(array->entry_array.items[i-1]);
1799 else
1800 p = le64toh(array->entry_array.items[i]);
1801
1802 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1803 if (r < 0)
1804 return r;
1805
1806 if (ret)
1807 *ret = o;
1808
1809 if (offset)
1810 *offset = p;
1811
1812 if (idx)
cbdca852 1813 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1814
1815 return 1;
cec736d2
LP
1816}
1817
f268980d
LP
1818static int generic_array_bisect_plus_one(
1819 JournalFile *f,
1820 uint64_t extra,
1821 uint64_t first,
1822 uint64_t n,
1823 uint64_t needle,
1824 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1825 direction_t direction,
1826 Object **ret,
1827 uint64_t *offset,
1828 uint64_t *idx) {
de190aef 1829
cec736d2 1830 int r;
cbdca852
LP
1831 bool step_back = false;
1832 Object *o;
cec736d2
LP
1833
1834 assert(f);
de190aef 1835 assert(test_object);
cec736d2 1836
de190aef
LP
1837 if (n <= 0)
1838 return 0;
cec736d2 1839
de190aef
LP
1840 /* This bisects the array in object 'first', but first checks
1841 * an extra */
de190aef
LP
1842 r = test_object(f, extra, needle);
1843 if (r < 0)
1844 return r;
a536e261
LP
1845
1846 if (r == TEST_FOUND)
1847 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1848
cbdca852
LP
1849 /* if we are looking with DIRECTION_UP then we need to first
1850 see if in the actual array there is a matching entry, and
1851 return the last one of that. But if there isn't any we need
1852 to return this one. Hence remember this, and return it
1853 below. */
1854 if (r == TEST_LEFT)
1855 step_back = direction == DIRECTION_UP;
de190aef 1856
cbdca852
LP
1857 if (r == TEST_RIGHT) {
1858 if (direction == DIRECTION_DOWN)
1859 goto found;
1860 else
1861 return 0;
a536e261 1862 }
cec736d2 1863
de190aef
LP
1864 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1865
cbdca852
LP
1866 if (r == 0 && step_back)
1867 goto found;
1868
ecf68b1d 1869 if (r > 0 && idx)
de190aef
LP
1870 (*idx) ++;
1871
1872 return r;
cbdca852
LP
1873
1874found:
1875 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1876 if (r < 0)
1877 return r;
1878
1879 if (ret)
1880 *ret = o;
1881
1882 if (offset)
1883 *offset = extra;
1884
1885 if (idx)
1886 *idx = 0;
1887
1888 return 1;
1889}
1890
44a6b1b6 1891_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1892 assert(f);
1893 assert(p > 0);
1894
1895 if (p == needle)
1896 return TEST_FOUND;
1897 else if (p < needle)
1898 return TEST_LEFT;
1899 else
1900 return TEST_RIGHT;
1901}
1902
de190aef
LP
1903static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1904 Object *o;
1905 int r;
1906
1907 assert(f);
1908 assert(p > 0);
1909
1910 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1911 if (r < 0)
1912 return r;
1913
de190aef
LP
1914 if (le64toh(o->entry.seqnum) == needle)
1915 return TEST_FOUND;
1916 else if (le64toh(o->entry.seqnum) < needle)
1917 return TEST_LEFT;
1918 else
1919 return TEST_RIGHT;
1920}
cec736d2 1921
de190aef
LP
1922int journal_file_move_to_entry_by_seqnum(
1923 JournalFile *f,
1924 uint64_t seqnum,
1925 direction_t direction,
1926 Object **ret,
1927 uint64_t *offset) {
1928
1929 return generic_array_bisect(f,
1930 le64toh(f->header->entry_array_offset),
1931 le64toh(f->header->n_entries),
1932 seqnum,
1933 test_object_seqnum,
1934 direction,
1935 ret, offset, NULL);
1936}
cec736d2 1937
de190aef
LP
1938static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1939 Object *o;
1940 int r;
1941
1942 assert(f);
1943 assert(p > 0);
1944
1945 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1946 if (r < 0)
1947 return r;
1948
1949 if (le64toh(o->entry.realtime) == needle)
1950 return TEST_FOUND;
1951 else if (le64toh(o->entry.realtime) < needle)
1952 return TEST_LEFT;
1953 else
1954 return TEST_RIGHT;
cec736d2
LP
1955}
1956
de190aef
LP
1957int journal_file_move_to_entry_by_realtime(
1958 JournalFile *f,
1959 uint64_t realtime,
1960 direction_t direction,
1961 Object **ret,
1962 uint64_t *offset) {
1963
1964 return generic_array_bisect(f,
1965 le64toh(f->header->entry_array_offset),
1966 le64toh(f->header->n_entries),
1967 realtime,
1968 test_object_realtime,
1969 direction,
1970 ret, offset, NULL);
1971}
1972
1973static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1974 Object *o;
1975 int r;
1976
1977 assert(f);
1978 assert(p > 0);
1979
1980 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1981 if (r < 0)
1982 return r;
1983
1984 if (le64toh(o->entry.monotonic) == needle)
1985 return TEST_FOUND;
1986 else if (le64toh(o->entry.monotonic) < needle)
1987 return TEST_LEFT;
1988 else
1989 return TEST_RIGHT;
1990}
1991
2a560338 1992static int find_data_object_by_boot_id(
47838ab3
ZJS
1993 JournalFile *f,
1994 sd_id128_t boot_id,
1995 Object **o,
1996 uint64_t *b) {
2a560338 1997
47838ab3
ZJS
1998 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1999
2000 sd_id128_to_string(boot_id, t + 9);
2001 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2002}
2003
de190aef
LP
2004int journal_file_move_to_entry_by_monotonic(
2005 JournalFile *f,
2006 sd_id128_t boot_id,
2007 uint64_t monotonic,
2008 direction_t direction,
2009 Object **ret,
2010 uint64_t *offset) {
2011
de190aef
LP
2012 Object *o;
2013 int r;
2014
cbdca852 2015 assert(f);
de190aef 2016
47838ab3 2017 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
2018 if (r < 0)
2019 return r;
cbdca852 2020 if (r == 0)
de190aef
LP
2021 return -ENOENT;
2022
2023 return generic_array_bisect_plus_one(f,
2024 le64toh(o->data.entry_offset),
2025 le64toh(o->data.entry_array_offset),
2026 le64toh(o->data.n_entries),
2027 monotonic,
2028 test_object_monotonic,
2029 direction,
2030 ret, offset, NULL);
2031}
2032
1fc605b0 2033void journal_file_reset_location(JournalFile *f) {
6573ef05 2034 f->location_type = LOCATION_HEAD;
1fc605b0 2035 f->current_offset = 0;
6573ef05
MS
2036 f->current_seqnum = 0;
2037 f->current_realtime = 0;
2038 f->current_monotonic = 0;
2039 zero(f->current_boot_id);
2040 f->current_xor_hash = 0;
2041}
2042
950c07d4 2043void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2044 f->location_type = LOCATION_SEEK;
2045 f->current_offset = offset;
2046 f->current_seqnum = le64toh(o->entry.seqnum);
2047 f->current_realtime = le64toh(o->entry.realtime);
2048 f->current_monotonic = le64toh(o->entry.monotonic);
2049 f->current_boot_id = o->entry.boot_id;
2050 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2051}
2052
d8ae66d7
MS
2053int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2054 assert(af);
2055 assert(bf);
2056 assert(af->location_type == LOCATION_SEEK);
2057 assert(bf->location_type == LOCATION_SEEK);
2058
2059 /* If contents and timestamps match, these entries are
2060 * identical, even if the seqnum does not match */
2061 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2062 af->current_monotonic == bf->current_monotonic &&
2063 af->current_realtime == bf->current_realtime &&
2064 af->current_xor_hash == bf->current_xor_hash)
2065 return 0;
2066
2067 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2068
2069 /* If this is from the same seqnum source, compare
2070 * seqnums */
2071 if (af->current_seqnum < bf->current_seqnum)
2072 return -1;
2073 if (af->current_seqnum > bf->current_seqnum)
2074 return 1;
2075
2076 /* Wow! This is weird, different data but the same
2077 * seqnums? Something is borked, but let's make the
2078 * best of it and compare by time. */
2079 }
2080
2081 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2082
2083 /* If the boot id matches, compare monotonic time */
2084 if (af->current_monotonic < bf->current_monotonic)
2085 return -1;
2086 if (af->current_monotonic > bf->current_monotonic)
2087 return 1;
2088 }
2089
2090 /* Otherwise, compare UTC time */
2091 if (af->current_realtime < bf->current_realtime)
2092 return -1;
2093 if (af->current_realtime > bf->current_realtime)
2094 return 1;
2095
2096 /* Finally, compare by contents */
2097 if (af->current_xor_hash < bf->current_xor_hash)
2098 return -1;
2099 if (af->current_xor_hash > bf->current_xor_hash)
2100 return 1;
2101
2102 return 0;
2103}
2104
de190aef
LP
2105int journal_file_next_entry(
2106 JournalFile *f,
f534928a 2107 uint64_t p,
de190aef
LP
2108 direction_t direction,
2109 Object **ret, uint64_t *offset) {
2110
fb099c8d 2111 uint64_t i, n, ofs;
cec736d2
LP
2112 int r;
2113
2114 assert(f);
de190aef
LP
2115
2116 n = le64toh(f->header->n_entries);
2117 if (n <= 0)
2118 return 0;
cec736d2 2119
f534928a 2120 if (p == 0)
de190aef 2121 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2122 else {
de190aef
LP
2123 r = generic_array_bisect(f,
2124 le64toh(f->header->entry_array_offset),
2125 le64toh(f->header->n_entries),
2126 p,
2127 test_object_offset,
2128 DIRECTION_DOWN,
2129 NULL, NULL,
2130 &i);
2131 if (r <= 0)
2132 return r;
2133
2134 if (direction == DIRECTION_DOWN) {
2135 if (i >= n - 1)
2136 return 0;
2137
2138 i++;
2139 } else {
2140 if (i <= 0)
2141 return 0;
2142
2143 i--;
2144 }
cec736d2
LP
2145 }
2146
de190aef 2147 /* And jump to it */
fb099c8d
ZJS
2148 r = generic_array_get(f,
2149 le64toh(f->header->entry_array_offset),
2150 i,
2151 ret, &ofs);
2152 if (r <= 0)
2153 return r;
2154
2155 if (p > 0 &&
2156 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2157 log_debug("%s: entry array corrupted at entry %"PRIu64,
2158 f->path, i);
2159 return -EBADMSG;
2160 }
2161
2162 if (offset)
2163 *offset = ofs;
2164
2165 return 1;
de190aef 2166}
cec736d2 2167
de190aef
LP
2168int journal_file_next_entry_for_data(
2169 JournalFile *f,
2170 Object *o, uint64_t p,
2171 uint64_t data_offset,
2172 direction_t direction,
2173 Object **ret, uint64_t *offset) {
2174
2175 uint64_t n, i;
cec736d2 2176 int r;
de190aef 2177 Object *d;
cec736d2
LP
2178
2179 assert(f);
de190aef 2180 assert(p > 0 || !o);
cec736d2 2181
de190aef 2182 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2183 if (r < 0)
de190aef 2184 return r;
cec736d2 2185
de190aef
LP
2186 n = le64toh(d->data.n_entries);
2187 if (n <= 0)
2188 return n;
cec736d2 2189
de190aef
LP
2190 if (!o)
2191 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2192 else {
2193 if (o->object.type != OBJECT_ENTRY)
2194 return -EINVAL;
cec736d2 2195
de190aef
LP
2196 r = generic_array_bisect_plus_one(f,
2197 le64toh(d->data.entry_offset),
2198 le64toh(d->data.entry_array_offset),
2199 le64toh(d->data.n_entries),
2200 p,
2201 test_object_offset,
2202 DIRECTION_DOWN,
2203 NULL, NULL,
2204 &i);
2205
2206 if (r <= 0)
cec736d2
LP
2207 return r;
2208
de190aef
LP
2209 if (direction == DIRECTION_DOWN) {
2210 if (i >= n - 1)
2211 return 0;
cec736d2 2212
de190aef
LP
2213 i++;
2214 } else {
2215 if (i <= 0)
2216 return 0;
cec736d2 2217
de190aef
LP
2218 i--;
2219 }
cec736d2 2220
de190aef 2221 }
cec736d2 2222
de190aef
LP
2223 return generic_array_get_plus_one(f,
2224 le64toh(d->data.entry_offset),
2225 le64toh(d->data.entry_array_offset),
2226 i,
2227 ret, offset);
2228}
cec736d2 2229
cbdca852
LP
2230int journal_file_move_to_entry_by_offset_for_data(
2231 JournalFile *f,
2232 uint64_t data_offset,
2233 uint64_t p,
2234 direction_t direction,
2235 Object **ret, uint64_t *offset) {
2236
2237 int r;
2238 Object *d;
2239
2240 assert(f);
2241
2242 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2243 if (r < 0)
2244 return r;
2245
2246 return generic_array_bisect_plus_one(f,
2247 le64toh(d->data.entry_offset),
2248 le64toh(d->data.entry_array_offset),
2249 le64toh(d->data.n_entries),
2250 p,
2251 test_object_offset,
2252 direction,
2253 ret, offset, NULL);
2254}
2255
2256int journal_file_move_to_entry_by_monotonic_for_data(
2257 JournalFile *f,
2258 uint64_t data_offset,
2259 sd_id128_t boot_id,
2260 uint64_t monotonic,
2261 direction_t direction,
2262 Object **ret, uint64_t *offset) {
2263
cbdca852
LP
2264 Object *o, *d;
2265 int r;
2266 uint64_t b, z;
2267
2268 assert(f);
2269
2270 /* First, seek by time */
47838ab3 2271 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2272 if (r < 0)
2273 return r;
2274 if (r == 0)
2275 return -ENOENT;
2276
2277 r = generic_array_bisect_plus_one(f,
2278 le64toh(o->data.entry_offset),
2279 le64toh(o->data.entry_array_offset),
2280 le64toh(o->data.n_entries),
2281 monotonic,
2282 test_object_monotonic,
2283 direction,
2284 NULL, &z, NULL);
2285 if (r <= 0)
2286 return r;
2287
2288 /* And now, continue seeking until we find an entry that
2289 * exists in both bisection arrays */
2290
2291 for (;;) {
2292 Object *qo;
2293 uint64_t p, q;
2294
2295 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2296 if (r < 0)
2297 return r;
2298
2299 r = generic_array_bisect_plus_one(f,
2300 le64toh(d->data.entry_offset),
2301 le64toh(d->data.entry_array_offset),
2302 le64toh(d->data.n_entries),
2303 z,
2304 test_object_offset,
2305 direction,
2306 NULL, &p, NULL);
2307 if (r <= 0)
2308 return r;
2309
2310 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2311 if (r < 0)
2312 return r;
2313
2314 r = generic_array_bisect_plus_one(f,
2315 le64toh(o->data.entry_offset),
2316 le64toh(o->data.entry_array_offset),
2317 le64toh(o->data.n_entries),
2318 p,
2319 test_object_offset,
2320 direction,
2321 &qo, &q, NULL);
2322
2323 if (r <= 0)
2324 return r;
2325
2326 if (p == q) {
2327 if (ret)
2328 *ret = qo;
2329 if (offset)
2330 *offset = q;
2331
2332 return 1;
2333 }
2334
2335 z = q;
2336 }
cbdca852
LP
2337}
2338
de190aef
LP
2339int journal_file_move_to_entry_by_seqnum_for_data(
2340 JournalFile *f,
2341 uint64_t data_offset,
2342 uint64_t seqnum,
2343 direction_t direction,
2344 Object **ret, uint64_t *offset) {
cec736d2 2345
de190aef
LP
2346 Object *d;
2347 int r;
cec736d2 2348
91a31dde
LP
2349 assert(f);
2350
de190aef 2351 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2352 if (r < 0)
de190aef 2353 return r;
cec736d2 2354
de190aef
LP
2355 return generic_array_bisect_plus_one(f,
2356 le64toh(d->data.entry_offset),
2357 le64toh(d->data.entry_array_offset),
2358 le64toh(d->data.n_entries),
2359 seqnum,
2360 test_object_seqnum,
2361 direction,
2362 ret, offset, NULL);
2363}
cec736d2 2364
de190aef
LP
2365int journal_file_move_to_entry_by_realtime_for_data(
2366 JournalFile *f,
2367 uint64_t data_offset,
2368 uint64_t realtime,
2369 direction_t direction,
2370 Object **ret, uint64_t *offset) {
2371
2372 Object *d;
2373 int r;
2374
91a31dde
LP
2375 assert(f);
2376
de190aef 2377 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2378 if (r < 0)
de190aef
LP
2379 return r;
2380
2381 return generic_array_bisect_plus_one(f,
2382 le64toh(d->data.entry_offset),
2383 le64toh(d->data.entry_array_offset),
2384 le64toh(d->data.n_entries),
2385 realtime,
2386 test_object_realtime,
2387 direction,
2388 ret, offset, NULL);
cec736d2
LP
2389}
2390
0284adc6 2391void journal_file_dump(JournalFile *f) {
7560fffc 2392 Object *o;
7560fffc 2393 int r;
0284adc6 2394 uint64_t p;
7560fffc
LP
2395
2396 assert(f);
2397
0284adc6 2398 journal_file_print_header(f);
7560fffc 2399
0284adc6
LP
2400 p = le64toh(f->header->header_size);
2401 while (p != 0) {
d05089d8 2402 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
2403 if (r < 0)
2404 goto fail;
7560fffc 2405
0284adc6 2406 switch (o->object.type) {
d98cc1f2 2407
0284adc6
LP
2408 case OBJECT_UNUSED:
2409 printf("Type: OBJECT_UNUSED\n");
2410 break;
d98cc1f2 2411
0284adc6
LP
2412 case OBJECT_DATA:
2413 printf("Type: OBJECT_DATA\n");
2414 break;
7560fffc 2415
3c1668da
LP
2416 case OBJECT_FIELD:
2417 printf("Type: OBJECT_FIELD\n");
2418 break;
2419
0284adc6 2420 case OBJECT_ENTRY:
507f22bd
ZJS
2421 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2422 le64toh(o->entry.seqnum),
2423 le64toh(o->entry.monotonic),
2424 le64toh(o->entry.realtime));
0284adc6 2425 break;
7560fffc 2426
0284adc6
LP
2427 case OBJECT_FIELD_HASH_TABLE:
2428 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2429 break;
7560fffc 2430
0284adc6
LP
2431 case OBJECT_DATA_HASH_TABLE:
2432 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2433 break;
7560fffc 2434
0284adc6
LP
2435 case OBJECT_ENTRY_ARRAY:
2436 printf("Type: OBJECT_ENTRY_ARRAY\n");
2437 break;
7560fffc 2438
0284adc6 2439 case OBJECT_TAG:
507f22bd
ZJS
2440 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2441 le64toh(o->tag.seqnum),
2442 le64toh(o->tag.epoch));
0284adc6 2443 break;
3c1668da
LP
2444
2445 default:
8facc349 2446 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 2447 break;
0284adc6 2448 }
7560fffc 2449
d89c8fdf
ZJS
2450 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2451 printf("Flags: %s\n",
2452 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 2453
0284adc6
LP
2454 if (p == le64toh(f->header->tail_object_offset))
2455 p = 0;
2456 else
2457 p = p + ALIGN64(le64toh(o->object.size));
2458 }
7560fffc 2459
0284adc6
LP
2460 return;
2461fail:
2462 log_error("File corrupt");
7560fffc
LP
2463}
2464
718fe4b1
ZJS
2465static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2466 const char *x;
2467
2468 x = format_timestamp(buf, l, t);
2469 if (x)
2470 return x;
2471 return " --- ";
2472}
2473
0284adc6 2474void journal_file_print_header(JournalFile *f) {
2765b7bb 2475 char a[33], b[33], c[33], d[33];
ed375beb 2476 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2477 struct stat st;
2478 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2479
2480 assert(f);
7560fffc 2481
0284adc6
LP
2482 printf("File Path: %s\n"
2483 "File ID: %s\n"
2484 "Machine ID: %s\n"
2485 "Boot ID: %s\n"
2486 "Sequential Number ID: %s\n"
2487 "State: %s\n"
2488 "Compatible Flags:%s%s\n"
d89c8fdf 2489 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
2490 "Header size: %"PRIu64"\n"
2491 "Arena size: %"PRIu64"\n"
2492 "Data Hash Table Size: %"PRIu64"\n"
2493 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2494 "Rotate Suggested: %s\n"
507f22bd
ZJS
2495 "Head Sequential Number: %"PRIu64"\n"
2496 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2497 "Head Realtime Timestamp: %s\n"
3223f44f 2498 "Tail Realtime Timestamp: %s\n"
ed375beb 2499 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2500 "Objects: %"PRIu64"\n"
2501 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2502 f->path,
2503 sd_id128_to_string(f->header->file_id, a),
2504 sd_id128_to_string(f->header->machine_id, b),
2505 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2506 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2507 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2508 f->header->state == STATE_ONLINE ? "ONLINE" :
2509 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 2510 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
2511 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2512 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2513 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2514 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
2515 le64toh(f->header->header_size),
2516 le64toh(f->header->arena_size),
2517 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2518 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2519 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2520 le64toh(f->header->head_entry_seqnum),
2521 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2522 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2523 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2524 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2525 le64toh(f->header->n_objects),
2526 le64toh(f->header->n_entries));
7560fffc 2527
0284adc6 2528 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2529 printf("Data Objects: %"PRIu64"\n"
0284adc6 2530 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2531 le64toh(f->header->n_data),
0284adc6 2532 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2533
0284adc6 2534 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2535 printf("Field Objects: %"PRIu64"\n"
0284adc6 2536 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2537 le64toh(f->header->n_fields),
0284adc6 2538 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2539
2540 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2541 printf("Tag Objects: %"PRIu64"\n",
2542 le64toh(f->header->n_tags));
3223f44f 2543 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2544 printf("Entry Array Objects: %"PRIu64"\n",
2545 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2546
2547 if (fstat(f->fd, &st) >= 0)
59f448cf 2548 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
7560fffc
LP
2549}
2550
fc68c929
LP
2551static int journal_file_warn_btrfs(JournalFile *f) {
2552 unsigned attrs;
2553 int r;
2554
2555 assert(f);
2556
2557 /* Before we write anything, check if the COW logic is turned
2558 * off on btrfs. Given our write pattern that is quite
2559 * unfriendly to COW file systems this should greatly improve
2560 * performance on COW file systems, such as btrfs, at the
2561 * expense of data integrity features (which shouldn't be too
2562 * bad, given that we do our own checksumming). */
2563
2564 r = btrfs_is_filesystem(f->fd);
2565 if (r < 0)
2566 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2567 if (!r)
2568 return 0;
2569
2570 r = read_attr_fd(f->fd, &attrs);
2571 if (r < 0)
2572 return log_warning_errno(r, "Failed to read file attributes: %m");
2573
2574 if (attrs & FS_NOCOW_FL) {
2575 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2576 return 0;
2577 }
2578
2579 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2580 "This is likely to slow down journal access substantially, please consider turning "
2581 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2582
2583 return 1;
2584}
2585
0284adc6
LP
2586int journal_file_open(
2587 const char *fname,
2588 int flags,
2589 mode_t mode,
2590 bool compress,
baed47c3 2591 bool seal,
0284adc6
LP
2592 JournalMetrics *metrics,
2593 MMapCache *mmap_cache,
2594 JournalFile *template,
2595 JournalFile **ret) {
7560fffc 2596
fa6ac760 2597 bool newly_created = false;
0284adc6 2598 JournalFile *f;
fa6ac760 2599 void *h;
0284adc6 2600 int r;
7560fffc 2601
0284adc6 2602 assert(fname);
0559d3a5 2603 assert(ret);
7560fffc 2604
0284adc6
LP
2605 if ((flags & O_ACCMODE) != O_RDONLY &&
2606 (flags & O_ACCMODE) != O_RDWR)
2607 return -EINVAL;
7560fffc 2608
a0108012
LP
2609 if (!endswith(fname, ".journal") &&
2610 !endswith(fname, ".journal~"))
0284adc6 2611 return -EINVAL;
7560fffc 2612
0284adc6
LP
2613 f = new0(JournalFile, 1);
2614 if (!f)
2615 return -ENOMEM;
7560fffc 2616
0284adc6
LP
2617 f->fd = -1;
2618 f->mode = mode;
7560fffc 2619
0284adc6
LP
2620 f->flags = flags;
2621 f->prot = prot_from_flags(flags);
2622 f->writable = (flags & O_ACCMODE) != O_RDONLY;
92261977 2623#if defined(HAVE_LZ4)
d89c8fdf
ZJS
2624 f->compress_lz4 = compress;
2625#elif defined(HAVE_XZ)
2626 f->compress_xz = compress;
48b61739 2627#endif
49a32d43 2628#ifdef HAVE_GCRYPT
baed47c3 2629 f->seal = seal;
49a32d43 2630#endif
7560fffc 2631
0284adc6
LP
2632 if (mmap_cache)
2633 f->mmap = mmap_cache_ref(mmap_cache);
2634 else {
84168d80 2635 f->mmap = mmap_cache_new();
0284adc6
LP
2636 if (!f->mmap) {
2637 r = -ENOMEM;
2638 goto fail;
2639 }
2640 }
7560fffc 2641
0284adc6
LP
2642 f->path = strdup(fname);
2643 if (!f->path) {
2644 r = -ENOMEM;
2645 goto fail;
2646 }
7560fffc 2647
4743015d 2648 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
2649 if (!f->chain_cache) {
2650 r = -ENOMEM;
2651 goto fail;
2652 }
2653
0284adc6
LP
2654 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2655 if (f->fd < 0) {
2656 r = -errno;
2657 goto fail;
7560fffc 2658 }
7560fffc 2659
2678031a
LP
2660 r = journal_file_fstat(f);
2661 if (r < 0)
0284adc6 2662 goto fail;
7560fffc 2663
0284adc6 2664 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a 2665
fc68c929 2666 (void) journal_file_warn_btrfs(f);
11689d2a 2667
fb0951b0
LP
2668 /* Let's attach the creation time to the journal file,
2669 * so that the vacuuming code knows the age of this
2670 * file even if the file might end up corrupted one
2671 * day... Ideally we'd just use the creation time many
2672 * file systems maintain for each file, but there is
2673 * currently no usable API to query this, hence let's
2674 * emulate this via extended attributes. If extended
2675 * attributes are not supported we'll just skip this,
7517e174 2676 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0 2677
d61b600d 2678 fd_setcrtime(f->fd, 0);
7560fffc 2679
feb12d3e 2680#ifdef HAVE_GCRYPT
0284adc6 2681 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2682 * just don't do sealing */
49a32d43
LP
2683 if (f->seal) {
2684 r = journal_file_fss_load(f);
2685 if (r < 0)
2686 f->seal = false;
2687 }
feb12d3e 2688#endif
7560fffc 2689
0284adc6
LP
2690 r = journal_file_init_header(f, template);
2691 if (r < 0)
2692 goto fail;
7560fffc 2693
2678031a
LP
2694 r = journal_file_fstat(f);
2695 if (r < 0)
0284adc6 2696 goto fail;
fb0951b0
LP
2697
2698 newly_created = true;
0284adc6 2699 }
7560fffc 2700
0284adc6
LP
2701 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2702 r = -EIO;
2703 goto fail;
2704 }
7560fffc 2705
fa6ac760 2706 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
977eaa1e 2707 if (r < 0)
0284adc6 2708 goto fail;
7560fffc 2709
fa6ac760
LP
2710 f->header = h;
2711
0284adc6
LP
2712 if (!newly_created) {
2713 r = journal_file_verify_header(f);
2714 if (r < 0)
2715 goto fail;
2716 }
7560fffc 2717
feb12d3e 2718#ifdef HAVE_GCRYPT
0284adc6 2719 if (!newly_created && f->writable) {
baed47c3 2720 r = journal_file_fss_load(f);
0284adc6
LP
2721 if (r < 0)
2722 goto fail;
2723 }
feb12d3e 2724#endif
cec736d2
LP
2725
2726 if (f->writable) {
4a92baf3
LP
2727 if (metrics) {
2728 journal_default_metrics(metrics, f->fd);
2729 f->metrics = *metrics;
2730 } else if (template)
2731 f->metrics = template->metrics;
2732
cec736d2
LP
2733 r = journal_file_refresh_header(f);
2734 if (r < 0)
2735 goto fail;
2736 }
2737
feb12d3e 2738#ifdef HAVE_GCRYPT
baed47c3 2739 r = journal_file_hmac_setup(f);
14d10188
LP
2740 if (r < 0)
2741 goto fail;
feb12d3e 2742#endif
14d10188 2743
cec736d2 2744 if (newly_created) {
de190aef 2745 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2746 if (r < 0)
2747 goto fail;
2748
de190aef 2749 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2750 if (r < 0)
2751 goto fail;
7560fffc 2752
feb12d3e 2753#ifdef HAVE_GCRYPT
7560fffc
LP
2754 r = journal_file_append_first_tag(f);
2755 if (r < 0)
2756 goto fail;
feb12d3e 2757#endif
cec736d2
LP
2758 }
2759
fa6ac760
LP
2760 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2761 r = -EIO;
2762 goto fail;
2763 }
2764
0559d3a5 2765 *ret = f;
cec736d2
LP
2766 return 0;
2767
2768fail:
fa6ac760
LP
2769 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2770 r = -EIO;
2771
cec736d2
LP
2772 journal_file_close(f);
2773
2774 return r;
2775}
0ac38b70 2776
baed47c3 2777int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2778 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2779 size_t l;
2780 JournalFile *old_file, *new_file = NULL;
2781 int r;
2782
2783 assert(f);
2784 assert(*f);
2785
2786 old_file = *f;
2787
2788 if (!old_file->writable)
2789 return -EINVAL;
2790
2791 if (!endswith(old_file->path, ".journal"))
2792 return -EINVAL;
2793
2794 l = strlen(old_file->path);
57535f47
ZJS
2795 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2796 (int) l - 8, old_file->path,
2797 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2798 le64toh((*f)->header->head_entry_seqnum),
2799 le64toh((*f)->header->head_entry_realtime));
2800 if (r < 0)
0ac38b70
LP
2801 return -ENOMEM;
2802
2678031a
LP
2803 /* Try to rename the file to the archived version. If the file
2804 * already was deleted, we'll get ENOENT, let's ignore that
2805 * case. */
0ac38b70 2806 r = rename(old_file->path, p);
2678031a 2807 if (r < 0 && errno != ENOENT)
0ac38b70
LP
2808 return -errno;
2809
ccdbaf91 2810 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2811
f27a3864
LP
2812 /* Currently, btrfs is not very good with out write patterns
2813 * and fragments heavily. Let's defrag our journal files when
2814 * we archive them */
2815 old_file->defrag_on_close = true;
2816
baed47c3 2817 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2818 journal_file_close(old_file);
2819
2820 *f = new_file;
2821 return r;
2822}
2823
9447a7f1
LP
2824int journal_file_open_reliably(
2825 const char *fname,
2826 int flags,
2827 mode_t mode,
7560fffc 2828 bool compress,
baed47c3 2829 bool seal,
4a92baf3 2830 JournalMetrics *metrics,
27370278 2831 MMapCache *mmap_cache,
9447a7f1
LP
2832 JournalFile *template,
2833 JournalFile **ret) {
2834
2835 int r;
2836 size_t l;
ed375beb 2837 _cleanup_free_ char *p = NULL;
9447a7f1 2838
070052ab 2839 r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
288359db
ZJS
2840 if (!IN_SET(r,
2841 -EBADMSG, /* corrupted */
2842 -ENODATA, /* truncated */
2843 -EHOSTDOWN, /* other machine */
2844 -EPROTONOSUPPORT, /* incompatible feature */
2845 -EBUSY, /* unclean shutdown */
2846 -ESHUTDOWN, /* already archived */
2847 -EIO, /* IO error, including SIGBUS on mmap */
2848 -EIDRM /* File has been deleted */))
9447a7f1
LP
2849 return r;
2850
2851 if ((flags & O_ACCMODE) == O_RDONLY)
2852 return r;
2853
2854 if (!(flags & O_CREAT))
2855 return r;
2856
7560fffc
LP
2857 if (!endswith(fname, ".journal"))
2858 return r;
2859
5c70eab4
LP
2860 /* The file is corrupted. Rotate it away and try it again (but only once) */
2861
9447a7f1 2862 l = strlen(fname);
d587eca5 2863 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
57535f47 2864 (int) l - 8, fname,
d587eca5 2865 now(CLOCK_REALTIME),
9bf3b535 2866 random_u64()) < 0)
9447a7f1
LP
2867 return -ENOMEM;
2868
65089b82 2869 if (rename(fname, p) < 0)
9447a7f1
LP
2870 return -errno;
2871
f27a3864
LP
2872 /* btrfs doesn't cope well with our write pattern and
2873 * fragments heavily. Let's defrag all files we rotate */
11689d2a
LP
2874
2875 (void) chattr_path(p, false, FS_NOCOW_FL);
f27a3864
LP
2876 (void) btrfs_defrag(p);
2877
65089b82 2878 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2879
070052ab 2880 return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
9447a7f1
LP
2881}
2882
cf244689
LP
2883int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2884 uint64_t i, n;
2885 uint64_t q, xor_hash = 0;
2886 int r;
2887 EntryItem *items;
2888 dual_timestamp ts;
2889
2890 assert(from);
2891 assert(to);
2892 assert(o);
2893 assert(p);
2894
2895 if (!to->writable)
2896 return -EPERM;
2897
2898 ts.monotonic = le64toh(o->entry.monotonic);
2899 ts.realtime = le64toh(o->entry.realtime);
2900
cf244689 2901 n = journal_file_entry_n_items(o);
4faa7004
TA
2902 /* alloca() can't take 0, hence let's allocate at least one */
2903 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
2904
2905 for (i = 0; i < n; i++) {
4fd052ae
FC
2906 uint64_t l, h;
2907 le64_t le_hash;
cf244689
LP
2908 size_t t;
2909 void *data;
2910 Object *u;
2911
2912 q = le64toh(o->entry.items[i].object_offset);
2913 le_hash = o->entry.items[i].hash;
2914
2915 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2916 if (r < 0)
2917 return r;
2918
2919 if (le_hash != o->data.hash)
2920 return -EBADMSG;
2921
2922 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2923 t = (size_t) l;
2924
2925 /* We hit the limit on 32bit machines */
2926 if ((uint64_t) t != l)
2927 return -E2BIG;
2928
d89c8fdf 2929 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 2930#if defined(HAVE_XZ) || defined(HAVE_LZ4)
a7f7d1bd 2931 size_t rsize = 0;
cf244689 2932
d89c8fdf
ZJS
2933 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2934 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2935 if (r < 0)
2936 return r;
cf244689
LP
2937
2938 data = from->compress_buffer;
2939 l = rsize;
3b1a55e1
ZJS
2940#else
2941 return -EPROTONOSUPPORT;
2942#endif
cf244689
LP
2943 } else
2944 data = o->data.payload;
2945
2946 r = journal_file_append_data(to, data, l, &u, &h);
2947 if (r < 0)
2948 return r;
2949
2950 xor_hash ^= le64toh(u->data.hash);
2951 items[i].object_offset = htole64(h);
2952 items[i].hash = u->data.hash;
2953
2954 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2955 if (r < 0)
2956 return r;
2957 }
2958
fa6ac760
LP
2959 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2960
2961 if (mmap_cache_got_sigbus(to->mmap, to->fd))
2962 return -EIO;
2963
2964 return r;
cf244689 2965}
babfc091 2966
8580d1f7
LP
2967void journal_reset_metrics(JournalMetrics *m) {
2968 assert(m);
2969
2970 /* Set everything to "pick automatic values". */
2971
2972 *m = (JournalMetrics) {
2973 .min_use = (uint64_t) -1,
2974 .max_use = (uint64_t) -1,
2975 .min_size = (uint64_t) -1,
2976 .max_size = (uint64_t) -1,
2977 .keep_free = (uint64_t) -1,
2978 .n_max_files = (uint64_t) -1,
2979 };
2980}
2981
babfc091 2982void journal_default_metrics(JournalMetrics *m, int fd) {
8580d1f7 2983 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
babfc091 2984 struct statvfs ss;
8580d1f7 2985 uint64_t fs_size;
babfc091
LP
2986
2987 assert(m);
2988 assert(fd >= 0);
2989
2990 if (fstatvfs(fd, &ss) >= 0)
2991 fs_size = ss.f_frsize * ss.f_blocks;
8580d1f7
LP
2992 else {
2993 log_debug_errno(errno, "Failed to detremine disk size: %m");
2994 fs_size = 0;
2995 }
babfc091
LP
2996
2997 if (m->max_use == (uint64_t) -1) {
2998
2999 if (fs_size > 0) {
3000 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3001
3002 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3003 m->max_use = DEFAULT_MAX_USE_UPPER;
3004
3005 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3006 m->max_use = DEFAULT_MAX_USE_LOWER;
3007 } else
3008 m->max_use = DEFAULT_MAX_USE_LOWER;
3009 } else {
3010 m->max_use = PAGE_ALIGN(m->max_use);
3011
8580d1f7 3012 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
babfc091
LP
3013 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3014 }
3015
8580d1f7
LP
3016 if (m->min_use == (uint64_t) -1)
3017 m->min_use = DEFAULT_MIN_USE;
3018
3019 if (m->min_use > m->max_use)
3020 m->min_use = m->max_use;
3021
babfc091
LP
3022 if (m->max_size == (uint64_t) -1) {
3023 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3024
3025 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3026 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3027 } else
3028 m->max_size = PAGE_ALIGN(m->max_size);
3029
8580d1f7
LP
3030 if (m->max_size != 0) {
3031 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3032 m->max_size = JOURNAL_FILE_SIZE_MIN;
babfc091 3033
8580d1f7
LP
3034 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3035 m->max_use = m->max_size*2;
3036 }
babfc091
LP
3037
3038 if (m->min_size == (uint64_t) -1)
3039 m->min_size = JOURNAL_FILE_SIZE_MIN;
3040 else {
3041 m->min_size = PAGE_ALIGN(m->min_size);
3042
3043 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3044 m->min_size = JOURNAL_FILE_SIZE_MIN;
3045
8580d1f7 3046 if (m->max_size != 0 && m->min_size > m->max_size)
babfc091
LP
3047 m->max_size = m->min_size;
3048 }
3049
3050 if (m->keep_free == (uint64_t) -1) {
3051
3052 if (fs_size > 0) {
8621b110 3053 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
3054
3055 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3056 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3057
3058 } else
3059 m->keep_free = DEFAULT_KEEP_FREE;
3060 }
3061
8580d1f7
LP
3062 if (m->n_max_files == (uint64_t) -1)
3063 m->n_max_files = DEFAULT_N_MAX_FILES;
3064
3065 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3066 format_bytes(a, sizeof(a), m->min_use),
3067 format_bytes(b, sizeof(b), m->max_use),
3068 format_bytes(c, sizeof(c), m->max_size),
3069 format_bytes(d, sizeof(d), m->min_size),
3070 format_bytes(e, sizeof(e), m->keep_free),
3071 m->n_max_files);
babfc091 3072}
08984293
LP
3073
3074int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
3075 assert(f);
3076 assert(from || to);
3077
3078 if (from) {
162566a4
LP
3079 if (f->header->head_entry_realtime == 0)
3080 return -ENOENT;
08984293 3081
162566a4 3082 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
3083 }
3084
3085 if (to) {
162566a4
LP
3086 if (f->header->tail_entry_realtime == 0)
3087 return -ENOENT;
08984293 3088
162566a4 3089 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
3090 }
3091
3092 return 1;
3093}
3094
3095int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
3096 Object *o;
3097 uint64_t p;
3098 int r;
3099
3100 assert(f);
3101 assert(from || to);
3102
47838ab3 3103 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
3104 if (r <= 0)
3105 return r;
3106
3107 if (le64toh(o->data.n_entries) <= 0)
3108 return 0;
3109
3110 if (from) {
3111 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3112 if (r < 0)
3113 return r;
3114
3115 *from = le64toh(o->entry.monotonic);
3116 }
3117
3118 if (to) {
3119 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3120 if (r < 0)
3121 return r;
3122
3123 r = generic_array_get_plus_one(f,
3124 le64toh(o->data.entry_offset),
3125 le64toh(o->data.entry_array_offset),
3126 le64toh(o->data.n_entries)-1,
3127 &o, NULL);
3128 if (r <= 0)
3129 return r;
3130
3131 *to = le64toh(o->entry.monotonic);
3132 }
3133
3134 return 1;
3135}
dca6219e 3136
fb0951b0 3137bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
3138 assert(f);
3139
3140 /* If we gained new header fields we gained new features,
3141 * hence suggest a rotation */
361f9cbc
LP
3142 if (le64toh(f->header->header_size) < sizeof(Header)) {
3143 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3144 return true;
361f9cbc 3145 }
dca6219e
LP
3146
3147 /* Let's check if the hash tables grew over a certain fill
3148 * level (75%, borrowing this value from Java's hash table
3149 * implementation), and if so suggest a rotation. To calculate
3150 * the fill level we need the n_data field, which only exists
3151 * in newer versions. */
3152
3153 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3154 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3155 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3156 f->path,
3157 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3158 le64toh(f->header->n_data),
3159 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3160 (unsigned long long) f->last_stat.st_size,
3161 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3162 return true;
361f9cbc 3163 }
dca6219e
LP
3164
3165 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3166 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3167 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3168 f->path,
3169 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3170 le64toh(f->header->n_fields),
3171 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3172 return true;
361f9cbc 3173 }
dca6219e 3174
0598fd4a
LP
3175 /* Are the data objects properly indexed by field objects? */
3176 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3177 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3178 le64toh(f->header->n_data) > 0 &&
3179 le64toh(f->header->n_fields) == 0)
3180 return true;
3181
fb0951b0
LP
3182 if (max_file_usec > 0) {
3183 usec_t t, h;
3184
3185 h = le64toh(f->header->head_entry_realtime);
3186 t = now(CLOCK_REALTIME);
3187
3188 if (h > 0 && t > h + max_file_usec)
3189 return true;
3190 }
3191
dca6219e
LP
3192 return false;
3193}