]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
btrfs-util: be more careful when invoking btrfs file system ioctls
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <sys/mman.h>
23#include <errno.h>
24#include <sys/uio.h>
25#include <unistd.h>
26#include <sys/statvfs.h>
27#include <fcntl.h>
28#include <stddef.h>
11689d2a 29#include <linux/fs.h>
fb0951b0 30
f27a3864 31#include "btrfs-util.h"
cec736d2
LP
32#include "journal-def.h"
33#include "journal-file.h"
0284adc6 34#include "journal-authenticate.h"
cec736d2 35#include "lookup3.h"
807e17f0 36#include "compress.h"
3df3e884 37#include "random-util.h"
cec736d2 38
4a92baf3
LP
39#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
40#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 41
be19b7df 42#define COMPRESSION_SIZE_THRESHOLD (512ULL)
807e17f0 43
babfc091 44/* This is the minimum journal file size */
253f59df 45#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
babfc091
LP
46
47/* These are the lower and upper bounds if we deduce the max_use value
48 * from the file system size */
49#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
50#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
51
52/* This is the upper bound if we deduce max_size from max_use */
71100051 53#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
babfc091
LP
54
55/* This is the upper bound if we deduce the keep_free value from the
56 * file system size */
57#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
58
59/* This is the keep_free value when we can't determine the system
60 * size */
61#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
62
dca6219e
LP
63/* n_data was the first entry we added after the initial file format design */
64#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 65
a4bcff5b
LP
66/* How many entries to keep in the entry array chain cache at max */
67#define CHAIN_CACHE_MAX 20
68
a676e665
LP
69/* How much to increase the journal file size at once each time we allocate something new. */
70#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
71
2678031a
LP
72/* Reread fstat() of the file for detecting deletions at least this often */
73#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
74
fa6ac760
LP
75/* The mmap context to use for the header: we pick the one just above the last defined object type */
76#define CONTEXT_HEADER _OBJECT_TYPE_MAX
77
9588bc32 78static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
79 assert(f);
80
81 if (!f->writable)
82 return -EPERM;
83
84 if (!(f->fd >= 0 && f->header))
85 return -EINVAL;
86
fa6ac760
LP
87 if (mmap_cache_got_sigbus(f->mmap, f->fd))
88 return -EIO;
89
26687bf8
OS
90 switch(f->header->state) {
91 case STATE_ONLINE:
92 return 0;
93
94 case STATE_OFFLINE:
95 f->header->state = STATE_ONLINE;
96 fsync(f->fd);
97 return 0;
98
99 default:
100 return -EINVAL;
101 }
102}
103
104int journal_file_set_offline(JournalFile *f) {
105 assert(f);
106
107 if (!f->writable)
108 return -EPERM;
109
110 if (!(f->fd >= 0 && f->header))
111 return -EINVAL;
112
113 if (f->header->state != STATE_ONLINE)
114 return 0;
115
116 fsync(f->fd);
117
fa6ac760
LP
118 if (mmap_cache_got_sigbus(f->mmap, f->fd))
119 return -EIO;
120
26687bf8
OS
121 f->header->state = STATE_OFFLINE;
122
fa6ac760
LP
123 if (mmap_cache_got_sigbus(f->mmap, f->fd))
124 return -EIO;
125
26687bf8
OS
126 fsync(f->fd);
127
128 return 0;
129}
130
cec736d2 131void journal_file_close(JournalFile *f) {
de190aef 132 assert(f);
cec736d2 133
feb12d3e 134#ifdef HAVE_GCRYPT
b0af6f41 135 /* Write the final tag */
c586dbf1 136 if (f->seal && f->writable)
b0af6f41 137 journal_file_append_tag(f);
feb12d3e 138#endif
b0af6f41 139
26687bf8 140 journal_file_set_offline(f);
cec736d2 141
fa6ac760
LP
142 if (f->mmap && f->fd >= 0)
143 mmap_cache_close_fd(f->mmap, f->fd);
cec736d2 144
11689d2a
LP
145 if (f->fd >= 0 && f->defrag_on_close) {
146
147 /* Be friendly to btrfs: turn COW back on again now,
148 * and defragment the file. We won't write to the file
149 * ever again, hence remove all fragmentation, and
150 * reenable all the good bits COW usually provides
151 * (such as data checksumming). */
152
1ed8f8c1 153 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
11689d2a
LP
154 (void) btrfs_defrag_fd(f->fd);
155 }
f27a3864 156
03e334a1 157 safe_close(f->fd);
cec736d2 158 free(f->path);
807e17f0 159
16e9f408
LP
160 if (f->mmap)
161 mmap_cache_unref(f->mmap);
162
4743015d 163 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 164
d89c8fdf 165#if defined(HAVE_XZ) || defined(HAVE_LZ4)
807e17f0
LP
166 free(f->compress_buffer);
167#endif
168
7560fffc 169#ifdef HAVE_GCRYPT
baed47c3
LP
170 if (f->fss_file)
171 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
b7c9ae91
LP
172 else if (f->fsprg_state)
173 free(f->fsprg_state);
174
175 free(f->fsprg_seed);
7560fffc
LP
176
177 if (f->hmac)
178 gcry_md_close(f->hmac);
179#endif
180
cec736d2
LP
181 free(f);
182}
183
0ac38b70 184static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 185 Header h = {};
cec736d2
LP
186 ssize_t k;
187 int r;
188
189 assert(f);
190
7560fffc 191 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 192 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 193
d89c8fdf
ZJS
194 h.incompatible_flags |= htole32(
195 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
196 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 197
d89c8fdf
ZJS
198 h.compatible_flags = htole32(
199 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 200
cec736d2
LP
201 r = sd_id128_randomize(&h.file_id);
202 if (r < 0)
203 return r;
204
0ac38b70
LP
205 if (template) {
206 h.seqnum_id = template->header->seqnum_id;
beec0085 207 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
208 } else
209 h.seqnum_id = h.file_id;
cec736d2
LP
210
211 k = pwrite(f->fd, &h, sizeof(h), 0);
212 if (k < 0)
213 return -errno;
214
215 if (k != sizeof(h))
216 return -EIO;
217
218 return 0;
219}
220
221static int journal_file_refresh_header(JournalFile *f) {
de190aef 222 sd_id128_t boot_id;
fa6ac760 223 int r;
cec736d2
LP
224
225 assert(f);
226
227 r = sd_id128_get_machine(&f->header->machine_id);
228 if (r < 0)
229 return r;
230
de190aef 231 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
232 if (r < 0)
233 return r;
234
de190aef
LP
235 if (sd_id128_equal(boot_id, f->header->boot_id))
236 f->tail_entry_monotonic_valid = true;
237
238 f->header->boot_id = boot_id;
239
fa6ac760 240 r = journal_file_set_online(f);
b788cc23 241
7560fffc 242 /* Sync the online state to disk */
a676e665 243 fsync(f->fd);
b788cc23 244
fa6ac760 245 return r;
cec736d2
LP
246}
247
248static int journal_file_verify_header(JournalFile *f) {
d89c8fdf
ZJS
249 uint32_t flags;
250
cec736d2
LP
251 assert(f);
252
7560fffc 253 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
254 return -EBADMSG;
255
7560fffc
LP
256 /* In both read and write mode we refuse to open files with
257 * incompatible flags we don't know */
d89c8fdf
ZJS
258 flags = le32toh(f->header->incompatible_flags);
259 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
260 if (flags & ~HEADER_INCOMPATIBLE_ANY)
261 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
262 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
263 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
264 if (flags)
265 log_debug("Journal file %s uses incompatible flags %"PRIx32
266 " disabled at compilation time.", f->path, flags);
cec736d2 267 return -EPROTONOSUPPORT;
d89c8fdf 268 }
cec736d2 269
7560fffc
LP
270 /* When open for writing we refuse to open files with
271 * compatible flags, too */
d89c8fdf
ZJS
272 flags = le32toh(f->header->compatible_flags);
273 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
274 if (flags & ~HEADER_COMPATIBLE_ANY)
275 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
276 f->path, flags & ~HEADER_COMPATIBLE_ANY);
277 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
278 if (flags)
279 log_debug("Journal file %s uses compatible flags %"PRIx32
280 " disabled at compilation time.", f->path, flags);
281 return -EPROTONOSUPPORT;
7560fffc
LP
282 }
283
db11ac1a
LP
284 if (f->header->state >= _STATE_MAX)
285 return -EBADMSG;
286
dca6219e
LP
287 /* The first addition was n_data, so check that we are at least this large */
288 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
289 return -EBADMSG;
290
8088cbd3 291 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
292 return -EBADMSG;
293
db11ac1a
LP
294 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
295 return -ENODATA;
296
297 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
298 return -ENODATA;
299
7762e02b
LP
300 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
301 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
302 !VALID64(le64toh(f->header->tail_object_offset)) ||
303 !VALID64(le64toh(f->header->entry_array_offset)))
304 return -ENODATA;
305
cec736d2 306 if (f->writable) {
ccdbaf91 307 uint8_t state;
cec736d2
LP
308 sd_id128_t machine_id;
309 int r;
310
311 r = sd_id128_get_machine(&machine_id);
312 if (r < 0)
313 return r;
314
315 if (!sd_id128_equal(machine_id, f->header->machine_id))
316 return -EHOSTDOWN;
317
de190aef 318 state = f->header->state;
cec736d2 319
71fa6f00
LP
320 if (state == STATE_ONLINE) {
321 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
322 return -EBUSY;
323 } else if (state == STATE_ARCHIVED)
cec736d2 324 return -ESHUTDOWN;
71fa6f00 325 else if (state != STATE_OFFLINE) {
8facc349 326 log_debug("Journal file %s has unknown state %i.", f->path, state);
71fa6f00
LP
327 return -EBUSY;
328 }
cec736d2
LP
329 }
330
d89c8fdf
ZJS
331 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
332 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 333
f1889c91 334 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 335
cec736d2
LP
336 return 0;
337}
338
2678031a
LP
339static int journal_file_fstat(JournalFile *f) {
340 assert(f);
341 assert(f->fd >= 0);
342
343 if (fstat(f->fd, &f->last_stat) < 0)
344 return -errno;
345
346 f->last_stat_usec = now(CLOCK_MONOTONIC);
347
348 /* Refuse appending to files that are already deleted */
349 if (f->last_stat.st_nlink <= 0)
350 return -EIDRM;
351
352 return 0;
353}
354
cec736d2 355static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 356 uint64_t old_size, new_size;
fec2aa2f 357 int r;
cec736d2
LP
358
359 assert(f);
360
cec736d2 361 /* We assume that this file is not sparse, and we know that
38ac38b2 362 * for sure, since we always call posix_fallocate()
cec736d2
LP
363 * ourselves */
364
fa6ac760
LP
365 if (mmap_cache_got_sigbus(f->mmap, f->fd))
366 return -EIO;
367
cec736d2 368 old_size =
23b0b2b2 369 le64toh(f->header->header_size) +
cec736d2
LP
370 le64toh(f->header->arena_size);
371
bc85bfee 372 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
373 if (new_size < le64toh(f->header->header_size))
374 new_size = le64toh(f->header->header_size);
bc85bfee 375
2678031a
LP
376 if (new_size <= old_size) {
377
378 /* We already pre-allocated enough space, but before
379 * we write to it, let's check with fstat() if the
380 * file got deleted, in order make sure we don't throw
381 * away the data immediately. Don't check fstat() for
382 * all writes though, but only once ever 10s. */
383
384 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
385 return 0;
386
387 return journal_file_fstat(f);
388 }
389
390 /* Allocate more space. */
cec736d2 391
a676e665 392 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 393 return -E2BIG;
cec736d2 394
a676e665 395 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
396 struct statvfs svfs;
397
398 if (fstatvfs(f->fd, &svfs) >= 0) {
399 uint64_t available;
400
401 available = svfs.f_bfree * svfs.f_bsize;
402
bc85bfee
LP
403 if (available >= f->metrics.keep_free)
404 available -= f->metrics.keep_free;
cec736d2
LP
405 else
406 available = 0;
407
408 if (new_size - old_size > available)
409 return -E2BIG;
410 }
411 }
412
eda4b58b
LP
413 /* Increase by larger blocks at once */
414 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
415 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
416 new_size = f->metrics.max_size;
417
bc85bfee
LP
418 /* Note that the glibc fallocate() fallback is very
419 inefficient, hence we try to minimize the allocation area
420 as we can. */
fec2aa2f
GV
421 r = posix_fallocate(f->fd, old_size, new_size - old_size);
422 if (r != 0)
423 return -r;
cec736d2 424
23b0b2b2 425 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 426
2678031a 427 return journal_file_fstat(f);
cec736d2
LP
428}
429
78519831 430static unsigned type_to_context(ObjectType type) {
d3d3208f 431 /* One context for each type, plus one catch-all for the rest */
69adae51 432 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 433 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 434 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
435}
436
7a9dabea 437static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
2678031a
LP
438 int r;
439
cec736d2 440 assert(f);
cec736d2
LP
441 assert(ret);
442
7762e02b
LP
443 if (size <= 0)
444 return -EINVAL;
445
2a59ea54 446 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
447 if (offset + size > (uint64_t) f->last_stat.st_size) {
448 /* Hmm, out of range? Let's refresh the fstat() data
449 * first, before we trust that check. */
450
2678031a
LP
451 r = journal_file_fstat(f);
452 if (r < 0)
453 return r;
454
455 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
456 return -EADDRNOTAVAIL;
457 }
458
7a9dabea 459 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
460}
461
16e9f408
LP
462static uint64_t minimum_header_size(Object *o) {
463
b8e891e6 464 static const uint64_t table[] = {
16e9f408
LP
465 [OBJECT_DATA] = sizeof(DataObject),
466 [OBJECT_FIELD] = sizeof(FieldObject),
467 [OBJECT_ENTRY] = sizeof(EntryObject),
468 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
469 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
470 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
471 [OBJECT_TAG] = sizeof(TagObject),
472 };
473
474 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
475 return sizeof(ObjectHeader);
476
477 return table[o->object.type];
478}
479
78519831 480int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
481 int r;
482 void *t;
483 Object *o;
484 uint64_t s;
485
486 assert(f);
487 assert(ret);
488
db11ac1a
LP
489 /* Objects may only be located at multiple of 64 bit */
490 if (!VALID64(offset))
491 return -EFAULT;
492
7a9dabea 493 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
494 if (r < 0)
495 return r;
496
497 o = (Object*) t;
498 s = le64toh(o->object.size);
499
500 if (s < sizeof(ObjectHeader))
501 return -EBADMSG;
502
16e9f408
LP
503 if (o->object.type <= OBJECT_UNUSED)
504 return -EBADMSG;
505
506 if (s < minimum_header_size(o))
507 return -EBADMSG;
508
d05089d8 509 if (type > OBJECT_UNUSED && o->object.type != type)
cec736d2
LP
510 return -EBADMSG;
511
512 if (s > sizeof(ObjectHeader)) {
7a9dabea 513 r = journal_file_move_to(f, type, false, offset, s, &t);
cec736d2
LP
514 if (r < 0)
515 return r;
516
517 o = (Object*) t;
518 }
519
cec736d2
LP
520 *ret = o;
521 return 0;
522}
523
d98cc1f2 524static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
525 uint64_t r;
526
527 assert(f);
528
beec0085 529 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
530
531 if (seqnum) {
de190aef 532 /* If an external seqnum counter was passed, we update
c2373f84
LP
533 * both the local and the external one, and set it to
534 * the maximum of both */
535
536 if (*seqnum + 1 > r)
537 r = *seqnum + 1;
538
539 *seqnum = r;
540 }
541
beec0085 542 f->header->tail_entry_seqnum = htole64(r);
cec736d2 543
beec0085
LP
544 if (f->header->head_entry_seqnum == 0)
545 f->header->head_entry_seqnum = htole64(r);
de190aef 546
cec736d2
LP
547 return r;
548}
549
78519831 550int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
551 int r;
552 uint64_t p;
553 Object *tail, *o;
554 void *t;
555
556 assert(f);
d05089d8 557 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
558 assert(size >= sizeof(ObjectHeader));
559 assert(offset);
560 assert(ret);
561
26687bf8
OS
562 r = journal_file_set_online(f);
563 if (r < 0)
564 return r;
565
cec736d2 566 p = le64toh(f->header->tail_object_offset);
cec736d2 567 if (p == 0)
23b0b2b2 568 p = le64toh(f->header->header_size);
cec736d2 569 else {
d05089d8 570 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
571 if (r < 0)
572 return r;
573
574 p += ALIGN64(le64toh(tail->object.size));
575 }
576
577 r = journal_file_allocate(f, p, size);
578 if (r < 0)
579 return r;
580
fcde2389 581 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
582 if (r < 0)
583 return r;
584
585 o = (Object*) t;
586
587 zero(o->object);
de190aef 588 o->object.type = type;
cec736d2
LP
589 o->object.size = htole64(size);
590
591 f->header->tail_object_offset = htole64(p);
cec736d2
LP
592 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
593
594 *ret = o;
595 *offset = p;
596
597 return 0;
598}
599
de190aef 600static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
601 uint64_t s, p;
602 Object *o;
603 int r;
604
605 assert(f);
606
dfabe643 607 /* We estimate that we need 1 hash table entry per 768 of
4a92baf3
LP
608 journal file and we want to make sure we never get beyond
609 75% fill level. Calculate the hash table size for the
610 maximum file size based on these metrics. */
611
dfabe643 612 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
613 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
614 s = DEFAULT_DATA_HASH_TABLE_SIZE;
615
507f22bd 616 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 617
de190aef
LP
618 r = journal_file_append_object(f,
619 OBJECT_DATA_HASH_TABLE,
620 offsetof(Object, hash_table.items) + s,
621 &o, &p);
cec736d2
LP
622 if (r < 0)
623 return r;
624
29804cc1 625 memzero(o->hash_table.items, s);
cec736d2 626
de190aef
LP
627 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
628 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
629
630 return 0;
631}
632
de190aef 633static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
634 uint64_t s, p;
635 Object *o;
636 int r;
637
638 assert(f);
639
3c1668da
LP
640 /* We use a fixed size hash table for the fields as this
641 * number should grow very slowly only */
642
de190aef
LP
643 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
644 r = journal_file_append_object(f,
645 OBJECT_FIELD_HASH_TABLE,
646 offsetof(Object, hash_table.items) + s,
647 &o, &p);
cec736d2
LP
648 if (r < 0)
649 return r;
650
29804cc1 651 memzero(o->hash_table.items, s);
cec736d2 652
de190aef
LP
653 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
654 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
655
656 return 0;
657}
658
de190aef 659static int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
660 uint64_t s, p;
661 void *t;
662 int r;
663
664 assert(f);
665
de190aef
LP
666 p = le64toh(f->header->data_hash_table_offset);
667 s = le64toh(f->header->data_hash_table_size);
cec736d2 668
de190aef 669 r = journal_file_move_to(f,
16e9f408 670 OBJECT_DATA_HASH_TABLE,
fcde2389 671 true,
de190aef
LP
672 p, s,
673 &t);
cec736d2
LP
674 if (r < 0)
675 return r;
676
de190aef 677 f->data_hash_table = t;
cec736d2
LP
678 return 0;
679}
680
de190aef 681static int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
682 uint64_t s, p;
683 void *t;
684 int r;
685
686 assert(f);
687
de190aef
LP
688 p = le64toh(f->header->field_hash_table_offset);
689 s = le64toh(f->header->field_hash_table_size);
cec736d2 690
de190aef 691 r = journal_file_move_to(f,
16e9f408 692 OBJECT_FIELD_HASH_TABLE,
fcde2389 693 true,
de190aef
LP
694 p, s,
695 &t);
cec736d2
LP
696 if (r < 0)
697 return r;
698
de190aef 699 f->field_hash_table = t;
cec736d2
LP
700 return 0;
701}
702
3c1668da
LP
703static int journal_file_link_field(
704 JournalFile *f,
705 Object *o,
706 uint64_t offset,
707 uint64_t hash) {
708
805d1486 709 uint64_t p, h, m;
3c1668da
LP
710 int r;
711
712 assert(f);
713 assert(o);
714 assert(offset > 0);
715
716 if (o->object.type != OBJECT_FIELD)
717 return -EINVAL;
718
805d1486
LP
719 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
720 if (m <= 0)
721 return -EBADMSG;
3c1668da 722
805d1486 723 /* This might alter the window we are looking at */
3c1668da
LP
724 o->field.next_hash_offset = o->field.head_data_offset = 0;
725
805d1486 726 h = hash % m;
3c1668da
LP
727 p = le64toh(f->field_hash_table[h].tail_hash_offset);
728 if (p == 0)
729 f->field_hash_table[h].head_hash_offset = htole64(offset);
730 else {
731 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
732 if (r < 0)
733 return r;
734
735 o->field.next_hash_offset = htole64(offset);
736 }
737
738 f->field_hash_table[h].tail_hash_offset = htole64(offset);
739
740 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
741 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
742
743 return 0;
744}
745
746static int journal_file_link_data(
747 JournalFile *f,
748 Object *o,
749 uint64_t offset,
750 uint64_t hash) {
751
805d1486 752 uint64_t p, h, m;
cec736d2
LP
753 int r;
754
755 assert(f);
756 assert(o);
757 assert(offset > 0);
b588975f
LP
758
759 if (o->object.type != OBJECT_DATA)
760 return -EINVAL;
cec736d2 761
805d1486
LP
762 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
763 if (m <= 0)
764 return -EBADMSG;
48496df6 765
805d1486 766 /* This might alter the window we are looking at */
de190aef
LP
767 o->data.next_hash_offset = o->data.next_field_offset = 0;
768 o->data.entry_offset = o->data.entry_array_offset = 0;
769 o->data.n_entries = 0;
cec736d2 770
805d1486 771 h = hash % m;
8db4213e 772 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 773 if (p == 0)
cec736d2 774 /* Only entry in the hash table is easy */
de190aef 775 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 776 else {
48496df6
LP
777 /* Move back to the previous data object, to patch in
778 * pointer */
cec736d2 779
de190aef 780 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
781 if (r < 0)
782 return r;
783
de190aef 784 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
785 }
786
de190aef 787 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 788
dca6219e
LP
789 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
790 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
791
cec736d2
LP
792 return 0;
793}
794
3c1668da
LP
795int journal_file_find_field_object_with_hash(
796 JournalFile *f,
797 const void *field, uint64_t size, uint64_t hash,
798 Object **ret, uint64_t *offset) {
799
805d1486 800 uint64_t p, osize, h, m;
3c1668da
LP
801 int r;
802
803 assert(f);
804 assert(field && size > 0);
805
806 osize = offsetof(Object, field.payload) + size;
807
805d1486
LP
808 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
809
810 if (m <= 0)
3c1668da
LP
811 return -EBADMSG;
812
805d1486 813 h = hash % m;
3c1668da
LP
814 p = le64toh(f->field_hash_table[h].head_hash_offset);
815
816 while (p > 0) {
817 Object *o;
818
819 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
820 if (r < 0)
821 return r;
822
823 if (le64toh(o->field.hash) == hash &&
824 le64toh(o->object.size) == osize &&
825 memcmp(o->field.payload, field, size) == 0) {
826
827 if (ret)
828 *ret = o;
829 if (offset)
830 *offset = p;
831
832 return 1;
833 }
834
835 p = le64toh(o->field.next_hash_offset);
836 }
837
838 return 0;
839}
840
841int journal_file_find_field_object(
842 JournalFile *f,
843 const void *field, uint64_t size,
844 Object **ret, uint64_t *offset) {
845
846 uint64_t hash;
847
848 assert(f);
849 assert(field && size > 0);
850
851 hash = hash64(field, size);
852
853 return journal_file_find_field_object_with_hash(f,
854 field, size, hash,
855 ret, offset);
856}
857
de190aef
LP
858int journal_file_find_data_object_with_hash(
859 JournalFile *f,
860 const void *data, uint64_t size, uint64_t hash,
861 Object **ret, uint64_t *offset) {
48496df6 862
805d1486 863 uint64_t p, osize, h, m;
cec736d2
LP
864 int r;
865
866 assert(f);
867 assert(data || size == 0);
868
869 osize = offsetof(Object, data.payload) + size;
870
805d1486
LP
871 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
872 if (m <= 0)
bc85bfee
LP
873 return -EBADMSG;
874
805d1486 875 h = hash % m;
de190aef 876 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 877
de190aef
LP
878 while (p > 0) {
879 Object *o;
cec736d2 880
de190aef 881 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
882 if (r < 0)
883 return r;
884
807e17f0 885 if (le64toh(o->data.hash) != hash)
85a131e8 886 goto next;
807e17f0 887
d89c8fdf 888 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 889#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51 890 uint64_t l;
a7f7d1bd 891 size_t rsize = 0;
cec736d2 892
807e17f0
LP
893 l = le64toh(o->object.size);
894 if (l <= offsetof(Object, data.payload))
cec736d2
LP
895 return -EBADMSG;
896
807e17f0
LP
897 l -= offsetof(Object, data.payload);
898
d89c8fdf
ZJS
899 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
900 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
901 if (r < 0)
902 return r;
807e17f0 903
b785c858 904 if (rsize == size &&
807e17f0
LP
905 memcmp(f->compress_buffer, data, size) == 0) {
906
907 if (ret)
908 *ret = o;
909
910 if (offset)
911 *offset = p;
912
913 return 1;
914 }
3b1a55e1
ZJS
915#else
916 return -EPROTONOSUPPORT;
917#endif
807e17f0
LP
918 } else if (le64toh(o->object.size) == osize &&
919 memcmp(o->data.payload, data, size) == 0) {
920
cec736d2
LP
921 if (ret)
922 *ret = o;
923
924 if (offset)
925 *offset = p;
926
de190aef 927 return 1;
cec736d2
LP
928 }
929
85a131e8 930 next:
cec736d2
LP
931 p = le64toh(o->data.next_hash_offset);
932 }
933
de190aef
LP
934 return 0;
935}
936
937int journal_file_find_data_object(
938 JournalFile *f,
939 const void *data, uint64_t size,
940 Object **ret, uint64_t *offset) {
941
942 uint64_t hash;
943
944 assert(f);
945 assert(data || size == 0);
946
947 hash = hash64(data, size);
948
949 return journal_file_find_data_object_with_hash(f,
950 data, size, hash,
951 ret, offset);
952}
953
3c1668da
LP
954static int journal_file_append_field(
955 JournalFile *f,
956 const void *field, uint64_t size,
957 Object **ret, uint64_t *offset) {
958
959 uint64_t hash, p;
960 uint64_t osize;
961 Object *o;
962 int r;
963
964 assert(f);
965 assert(field && size > 0);
966
967 hash = hash64(field, size);
968
969 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
970 if (r < 0)
971 return r;
972 else if (r > 0) {
973
974 if (ret)
975 *ret = o;
976
977 if (offset)
978 *offset = p;
979
980 return 0;
981 }
982
983 osize = offsetof(Object, field.payload) + size;
984 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
985 if (r < 0)
986 return r;
3c1668da
LP
987
988 o->field.hash = htole64(hash);
989 memcpy(o->field.payload, field, size);
990
991 r = journal_file_link_field(f, o, p, hash);
992 if (r < 0)
993 return r;
994
995 /* The linking might have altered the window, so let's
996 * refresh our pointer */
997 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
998 if (r < 0)
999 return r;
1000
1001#ifdef HAVE_GCRYPT
1002 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1003 if (r < 0)
1004 return r;
1005#endif
1006
1007 if (ret)
1008 *ret = o;
1009
1010 if (offset)
1011 *offset = p;
1012
1013 return 0;
1014}
1015
48496df6
LP
1016static int journal_file_append_data(
1017 JournalFile *f,
1018 const void *data, uint64_t size,
1019 Object **ret, uint64_t *offset) {
1020
de190aef
LP
1021 uint64_t hash, p;
1022 uint64_t osize;
1023 Object *o;
d89c8fdf 1024 int r, compression = 0;
3c1668da 1025 const void *eq;
de190aef
LP
1026
1027 assert(f);
1028 assert(data || size == 0);
1029
1030 hash = hash64(data, size);
1031
1032 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1033 if (r < 0)
1034 return r;
1035 else if (r > 0) {
1036
1037 if (ret)
1038 *ret = o;
1039
1040 if (offset)
1041 *offset = p;
1042
1043 return 0;
1044 }
1045
1046 osize = offsetof(Object, data.payload) + size;
1047 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1048 if (r < 0)
1049 return r;
1050
cec736d2 1051 o->data.hash = htole64(hash);
807e17f0 1052
d89c8fdf
ZJS
1053#if defined(HAVE_XZ) || defined(HAVE_LZ4)
1054 if (f->compress_xz &&
807e17f0 1055 size >= COMPRESSION_SIZE_THRESHOLD) {
a7f7d1bd 1056 size_t rsize = 0;
807e17f0 1057
d89c8fdf 1058 compression = compress_blob(data, size, o->data.payload, &rsize);
807e17f0 1059
d89c8fdf 1060 if (compression) {
807e17f0 1061 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1062 o->object.flags |= compression;
807e17f0 1063
fa1c4b51 1064 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1065 size, rsize, object_compressed_to_string(compression));
807e17f0
LP
1066 }
1067 }
1068#endif
1069
d89c8fdf 1070 if (!compression && size > 0)
807e17f0 1071 memcpy(o->data.payload, data, size);
cec736d2 1072
de190aef 1073 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1074 if (r < 0)
1075 return r;
1076
48496df6
LP
1077 /* The linking might have altered the window, so let's
1078 * refresh our pointer */
1079 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1080 if (r < 0)
1081 return r;
1082
08c6f819
SL
1083 if (!data)
1084 eq = NULL;
1085 else
1086 eq = memchr(data, '=', size);
3c1668da 1087 if (eq && eq > data) {
748db592 1088 Object *fo = NULL;
3c1668da 1089 uint64_t fp;
3c1668da
LP
1090
1091 /* Create field object ... */
1092 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1093 if (r < 0)
1094 return r;
1095
1096 /* ... and link it in. */
1097 o->data.next_field_offset = fo->field.head_data_offset;
1098 fo->field.head_data_offset = le64toh(p);
1099 }
1100
5996c7c2
LP
1101#ifdef HAVE_GCRYPT
1102 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1103 if (r < 0)
1104 return r;
1105#endif
1106
cec736d2
LP
1107 if (ret)
1108 *ret = o;
1109
1110 if (offset)
de190aef 1111 *offset = p;
cec736d2
LP
1112
1113 return 0;
1114}
1115
1116uint64_t journal_file_entry_n_items(Object *o) {
1117 assert(o);
b588975f
LP
1118
1119 if (o->object.type != OBJECT_ENTRY)
1120 return 0;
cec736d2
LP
1121
1122 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1123}
1124
0284adc6 1125uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1126 assert(o);
b588975f
LP
1127
1128 if (o->object.type != OBJECT_ENTRY_ARRAY)
1129 return 0;
de190aef
LP
1130
1131 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1132}
1133
fb9a24b6
LP
1134uint64_t journal_file_hash_table_n_items(Object *o) {
1135 assert(o);
b588975f
LP
1136
1137 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1138 o->object.type != OBJECT_FIELD_HASH_TABLE)
1139 return 0;
fb9a24b6
LP
1140
1141 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1142}
1143
de190aef 1144static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1145 le64_t *first,
1146 le64_t *idx,
de190aef 1147 uint64_t p) {
cec736d2 1148 int r;
de190aef
LP
1149 uint64_t n = 0, ap = 0, q, i, a, hidx;
1150 Object *o;
1151
cec736d2 1152 assert(f);
de190aef
LP
1153 assert(first);
1154 assert(idx);
1155 assert(p > 0);
cec736d2 1156
de190aef
LP
1157 a = le64toh(*first);
1158 i = hidx = le64toh(*idx);
1159 while (a > 0) {
1160
1161 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1162 if (r < 0)
1163 return r;
cec736d2 1164
de190aef
LP
1165 n = journal_file_entry_array_n_items(o);
1166 if (i < n) {
1167 o->entry_array.items[i] = htole64(p);
1168 *idx = htole64(hidx + 1);
1169 return 0;
1170 }
cec736d2 1171
de190aef
LP
1172 i -= n;
1173 ap = a;
1174 a = le64toh(o->entry_array.next_entry_array_offset);
1175 }
1176
1177 if (hidx > n)
1178 n = (hidx+1) * 2;
1179 else
1180 n = n * 2;
1181
1182 if (n < 4)
1183 n = 4;
1184
1185 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1186 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1187 &o, &q);
cec736d2
LP
1188 if (r < 0)
1189 return r;
1190
feb12d3e 1191#ifdef HAVE_GCRYPT
5996c7c2 1192 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1193 if (r < 0)
1194 return r;
feb12d3e 1195#endif
b0af6f41 1196
de190aef 1197 o->entry_array.items[i] = htole64(p);
cec736d2 1198
de190aef 1199 if (ap == 0)
7be3aa17 1200 *first = htole64(q);
cec736d2 1201 else {
de190aef 1202 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1203 if (r < 0)
1204 return r;
1205
de190aef
LP
1206 o->entry_array.next_entry_array_offset = htole64(q);
1207 }
cec736d2 1208
2dee23eb
LP
1209 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1210 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1211
de190aef
LP
1212 *idx = htole64(hidx + 1);
1213
1214 return 0;
1215}
cec736d2 1216
de190aef 1217static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1218 le64_t *extra,
1219 le64_t *first,
1220 le64_t *idx,
de190aef
LP
1221 uint64_t p) {
1222
1223 int r;
1224
1225 assert(f);
1226 assert(extra);
1227 assert(first);
1228 assert(idx);
1229 assert(p > 0);
1230
1231 if (*idx == 0)
1232 *extra = htole64(p);
1233 else {
4fd052ae 1234 le64_t i;
de190aef 1235
7be3aa17 1236 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1237 r = link_entry_into_array(f, first, &i, p);
1238 if (r < 0)
1239 return r;
cec736d2
LP
1240 }
1241
de190aef
LP
1242 *idx = htole64(le64toh(*idx) + 1);
1243 return 0;
1244}
1245
1246static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1247 uint64_t p;
1248 int r;
1249 assert(f);
1250 assert(o);
1251 assert(offset > 0);
1252
1253 p = le64toh(o->entry.items[i].object_offset);
1254 if (p == 0)
1255 return -EINVAL;
1256
1257 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1258 if (r < 0)
1259 return r;
1260
de190aef
LP
1261 return link_entry_into_array_plus_one(f,
1262 &o->data.entry_offset,
1263 &o->data.entry_array_offset,
1264 &o->data.n_entries,
1265 offset);
cec736d2
LP
1266}
1267
1268static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1269 uint64_t n, i;
cec736d2
LP
1270 int r;
1271
1272 assert(f);
1273 assert(o);
1274 assert(offset > 0);
b588975f
LP
1275
1276 if (o->object.type != OBJECT_ENTRY)
1277 return -EINVAL;
cec736d2 1278
b788cc23
LP
1279 __sync_synchronize();
1280
cec736d2 1281 /* Link up the entry itself */
de190aef
LP
1282 r = link_entry_into_array(f,
1283 &f->header->entry_array_offset,
1284 &f->header->n_entries,
1285 offset);
1286 if (r < 0)
1287 return r;
cec736d2 1288
507f22bd 1289 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1290
de190aef 1291 if (f->header->head_entry_realtime == 0)
0ac38b70 1292 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1293
0ac38b70 1294 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1295 f->header->tail_entry_monotonic = o->entry.monotonic;
1296
1297 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1298
1299 /* Link up the items */
1300 n = journal_file_entry_n_items(o);
1301 for (i = 0; i < n; i++) {
1302 r = journal_file_link_entry_item(f, o, offset, i);
1303 if (r < 0)
1304 return r;
1305 }
1306
cec736d2
LP
1307 return 0;
1308}
1309
1310static int journal_file_append_entry_internal(
1311 JournalFile *f,
1312 const dual_timestamp *ts,
1313 uint64_t xor_hash,
1314 const EntryItem items[], unsigned n_items,
de190aef 1315 uint64_t *seqnum,
cec736d2
LP
1316 Object **ret, uint64_t *offset) {
1317 uint64_t np;
1318 uint64_t osize;
1319 Object *o;
1320 int r;
1321
1322 assert(f);
1323 assert(items || n_items == 0);
de190aef 1324 assert(ts);
cec736d2
LP
1325
1326 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1327
de190aef 1328 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1329 if (r < 0)
1330 return r;
1331
d98cc1f2 1332 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1333 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1334 o->entry.realtime = htole64(ts->realtime);
1335 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1336 o->entry.xor_hash = htole64(xor_hash);
1337 o->entry.boot_id = f->header->boot_id;
1338
feb12d3e 1339#ifdef HAVE_GCRYPT
5996c7c2 1340 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1341 if (r < 0)
1342 return r;
feb12d3e 1343#endif
b0af6f41 1344
cec736d2
LP
1345 r = journal_file_link_entry(f, o, np);
1346 if (r < 0)
1347 return r;
1348
1349 if (ret)
1350 *ret = o;
1351
1352 if (offset)
1353 *offset = np;
1354
1355 return 0;
1356}
1357
cf244689 1358void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1359 assert(f);
1360
1361 /* inotify() does not receive IN_MODIFY events from file
1362 * accesses done via mmap(). After each access we hence
1363 * trigger IN_MODIFY by truncating the journal file to its
1364 * current size which triggers IN_MODIFY. */
1365
bc85bfee
LP
1366 __sync_synchronize();
1367
50f20cfd 1368 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
56f64d95 1369 log_error_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1370}
1371
1f2da9ec
LP
1372static int entry_item_cmp(const void *_a, const void *_b) {
1373 const EntryItem *a = _a, *b = _b;
1374
1375 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1376 return -1;
1377 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1378 return 1;
1379 return 0;
1380}
1381
de190aef 1382int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1383 unsigned i;
1384 EntryItem *items;
1385 int r;
1386 uint64_t xor_hash = 0;
de190aef 1387 struct dual_timestamp _ts;
cec736d2
LP
1388
1389 assert(f);
1390 assert(iovec || n_iovec == 0);
1391
de190aef
LP
1392 if (!ts) {
1393 dual_timestamp_get(&_ts);
1394 ts = &_ts;
1395 }
1396
1397 if (f->tail_entry_monotonic_valid &&
1398 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1399 return -EINVAL;
1400
feb12d3e 1401#ifdef HAVE_GCRYPT
7560fffc
LP
1402 r = journal_file_maybe_append_tag(f, ts->realtime);
1403 if (r < 0)
1404 return r;
feb12d3e 1405#endif
7560fffc 1406
64825d3c 1407 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1408 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1409
1410 for (i = 0; i < n_iovec; i++) {
1411 uint64_t p;
1412 Object *o;
1413
1414 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1415 if (r < 0)
cf244689 1416 return r;
cec736d2
LP
1417
1418 xor_hash ^= le64toh(o->data.hash);
1419 items[i].object_offset = htole64(p);
de7b95cd 1420 items[i].hash = o->data.hash;
cec736d2
LP
1421 }
1422
1f2da9ec
LP
1423 /* Order by the position on disk, in order to improve seek
1424 * times for rotating media. */
7ff7394d 1425 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1426
de190aef 1427 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1428
fa6ac760
LP
1429 /* If the memory mapping triggered a SIGBUS then we return an
1430 * IO error and ignore the error code passed down to us, since
1431 * it is very likely just an effect of a nullified replacement
1432 * mapping page */
1433
1434 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1435 r = -EIO;
1436
50f20cfd
LP
1437 journal_file_post_change(f);
1438
cec736d2
LP
1439 return r;
1440}
1441
/* One cached position inside a chain of entry array objects. */
typedef struct ChainCacheItem {
        /* the array at the beginning of the chain */
        uint64_t first;

        /* the cached array */
        uint64_t array;

        /* the first item in the cached array */
        uint64_t begin;

        /* the total number of items in all arrays before this one in the chain */
        uint64_t total;

        /* the last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1449
1450static void chain_cache_put(
4743015d 1451 OrderedHashmap *h,
a4bcff5b
LP
1452 ChainCacheItem *ci,
1453 uint64_t first,
1454 uint64_t array,
1455 uint64_t begin,
f268980d
LP
1456 uint64_t total,
1457 uint64_t last_index) {
a4bcff5b
LP
1458
1459 if (!ci) {
34741aa3
LP
1460 /* If the chain item to cache for this chain is the
1461 * first one it's not worth caching anything */
1462 if (array == first)
1463 return;
1464
29433089 1465 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 1466 ci = ordered_hashmap_steal_first(h);
29433089
LP
1467 assert(ci);
1468 } else {
a4bcff5b
LP
1469 ci = new(ChainCacheItem, 1);
1470 if (!ci)
1471 return;
1472 }
1473
1474 ci->first = first;
1475
4743015d 1476 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
1477 free(ci);
1478 return;
1479 }
1480 } else
1481 assert(ci->first == first);
1482
1483 ci->array = array;
1484 ci->begin = begin;
1485 ci->total = total;
f268980d 1486 ci->last_index = last_index;
a4bcff5b
LP
1487}
1488
f268980d
LP
1489static int generic_array_get(
1490 JournalFile *f,
1491 uint64_t first,
1492 uint64_t i,
1493 Object **ret, uint64_t *offset) {
de190aef 1494
cec736d2 1495 Object *o;
a4bcff5b 1496 uint64_t p = 0, a, t = 0;
cec736d2 1497 int r;
a4bcff5b 1498 ChainCacheItem *ci;
cec736d2
LP
1499
1500 assert(f);
1501
de190aef 1502 a = first;
a4bcff5b
LP
1503
1504 /* Try the chain cache first */
4743015d 1505 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1506 if (ci && i > ci->total) {
1507 a = ci->array;
1508 i -= ci->total;
1509 t = ci->total;
1510 }
1511
de190aef 1512 while (a > 0) {
a4bcff5b 1513 uint64_t k;
cec736d2 1514
de190aef
LP
1515 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1516 if (r < 0)
1517 return r;
cec736d2 1518
a4bcff5b
LP
1519 k = journal_file_entry_array_n_items(o);
1520 if (i < k) {
de190aef 1521 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1522 goto found;
cec736d2
LP
1523 }
1524
a4bcff5b
LP
1525 i -= k;
1526 t += k;
de190aef
LP
1527 a = le64toh(o->entry_array.next_entry_array_offset);
1528 }
1529
a4bcff5b
LP
1530 return 0;
1531
1532found:
1533 /* Let's cache this item for the next invocation */
af13a6b0 1534 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1535
1536 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1537 if (r < 0)
1538 return r;
1539
1540 if (ret)
1541 *ret = o;
1542
1543 if (offset)
1544 *offset = p;
1545
1546 return 1;
1547}
1548
f268980d
LP
1549static int generic_array_get_plus_one(
1550 JournalFile *f,
1551 uint64_t extra,
1552 uint64_t first,
1553 uint64_t i,
1554 Object **ret, uint64_t *offset) {
de190aef
LP
1555
1556 Object *o;
1557
1558 assert(f);
1559
1560 if (i == 0) {
1561 int r;
1562
1563 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1564 if (r < 0)
1565 return r;
1566
de190aef
LP
1567 if (ret)
1568 *ret = o;
cec736d2 1569
de190aef
LP
1570 if (offset)
1571 *offset = extra;
cec736d2 1572
de190aef 1573 return 1;
cec736d2
LP
1574 }
1575
de190aef
LP
1576 return generic_array_get(f, first, i-1, ret, offset);
1577}
cec736d2 1578
de190aef
LP
/* Results of the test_object() callbacks used when bisecting */
enum {
        TEST_FOUND = 0, /* the tested entry matches the needle */
        TEST_LEFT  = 1, /* the tested entry is smaller than the needle */
        TEST_RIGHT = 2  /* the tested entry is larger than the needle */
};
cec736d2 1584
f268980d
LP
1585static int generic_array_bisect(
1586 JournalFile *f,
1587 uint64_t first,
1588 uint64_t n,
1589 uint64_t needle,
1590 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1591 direction_t direction,
1592 Object **ret,
1593 uint64_t *offset,
1594 uint64_t *idx) {
1595
1596 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1597 bool subtract_one = false;
1598 Object *o, *array = NULL;
1599 int r;
a4bcff5b 1600 ChainCacheItem *ci;
cec736d2 1601
de190aef
LP
1602 assert(f);
1603 assert(test_object);
cec736d2 1604
a4bcff5b 1605 /* Start with the first array in the chain */
de190aef 1606 a = first;
a4bcff5b 1607
4743015d 1608 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1609 if (ci && n > ci->total) {
1610 /* Ah, we have iterated this bisection array chain
1611 * previously! Let's see if we can skip ahead in the
1612 * chain, as far as the last time. But we can't jump
1613 * backwards in the chain, so let's check that
1614 * first. */
1615
1616 r = test_object(f, ci->begin, needle);
1617 if (r < 0)
1618 return r;
1619
1620 if (r == TEST_LEFT) {
f268980d 1621 /* OK, what we are looking for is right of the
a4bcff5b
LP
1622 * begin of this EntryArray, so let's jump
1623 * straight to previously cached array in the
1624 * chain */
1625
1626 a = ci->array;
1627 n -= ci->total;
1628 t = ci->total;
f268980d 1629 last_index = ci->last_index;
a4bcff5b
LP
1630 }
1631 }
1632
de190aef
LP
1633 while (a > 0) {
1634 uint64_t left, right, k, lp;
1635
1636 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1637 if (r < 0)
1638 return r;
1639
de190aef
LP
1640 k = journal_file_entry_array_n_items(array);
1641 right = MIN(k, n);
1642 if (right <= 0)
1643 return 0;
cec736d2 1644
de190aef
LP
1645 i = right - 1;
1646 lp = p = le64toh(array->entry_array.items[i]);
1647 if (p <= 0)
1648 return -EBADMSG;
cec736d2 1649
de190aef
LP
1650 r = test_object(f, p, needle);
1651 if (r < 0)
1652 return r;
cec736d2 1653
de190aef
LP
1654 if (r == TEST_FOUND)
1655 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1656
1657 if (r == TEST_RIGHT) {
1658 left = 0;
1659 right -= 1;
f268980d
LP
1660
1661 if (last_index != (uint64_t) -1) {
1662 assert(last_index <= right);
1663
1664 /* If we cached the last index we
1665 * looked at, let's try to not to jump
1666 * too wildly around and see if we can
1667 * limit the range to look at early to
1668 * the immediate neighbors of the last
1669 * index we looked at. */
1670
1671 if (last_index > 0) {
1672 uint64_t x = last_index - 1;
1673
1674 p = le64toh(array->entry_array.items[x]);
1675 if (p <= 0)
1676 return -EBADMSG;
1677
1678 r = test_object(f, p, needle);
1679 if (r < 0)
1680 return r;
1681
1682 if (r == TEST_FOUND)
1683 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1684
1685 if (r == TEST_RIGHT)
1686 right = x;
1687 else
1688 left = x + 1;
1689 }
1690
1691 if (last_index < right) {
1692 uint64_t y = last_index + 1;
1693
1694 p = le64toh(array->entry_array.items[y]);
1695 if (p <= 0)
1696 return -EBADMSG;
1697
1698 r = test_object(f, p, needle);
1699 if (r < 0)
1700 return r;
1701
1702 if (r == TEST_FOUND)
1703 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1704
1705 if (r == TEST_RIGHT)
1706 right = y;
1707 else
1708 left = y + 1;
1709 }
f268980d
LP
1710 }
1711
de190aef
LP
1712 for (;;) {
1713 if (left == right) {
1714 if (direction == DIRECTION_UP)
1715 subtract_one = true;
1716
1717 i = left;
1718 goto found;
1719 }
1720
1721 assert(left < right);
de190aef 1722 i = (left + right) / 2;
f268980d 1723
de190aef
LP
1724 p = le64toh(array->entry_array.items[i]);
1725 if (p <= 0)
1726 return -EBADMSG;
1727
1728 r = test_object(f, p, needle);
1729 if (r < 0)
1730 return r;
cec736d2 1731
de190aef
LP
1732 if (r == TEST_FOUND)
1733 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1734
1735 if (r == TEST_RIGHT)
1736 right = i;
1737 else
1738 left = i + 1;
1739 }
1740 }
1741
2173cbf8 1742 if (k >= n) {
cbdca852
LP
1743 if (direction == DIRECTION_UP) {
1744 i = n;
1745 subtract_one = true;
1746 goto found;
1747 }
1748
cec736d2 1749 return 0;
cbdca852 1750 }
cec736d2 1751
de190aef
LP
1752 last_p = lp;
1753
1754 n -= k;
1755 t += k;
f268980d 1756 last_index = (uint64_t) -1;
de190aef 1757 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1758 }
1759
1760 return 0;
de190aef
LP
1761
1762found:
1763 if (subtract_one && t == 0 && i == 0)
1764 return 0;
1765
a4bcff5b 1766 /* Let's cache this item for the next invocation */
af13a6b0 1767 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 1768
de190aef
LP
1769 if (subtract_one && i == 0)
1770 p = last_p;
1771 else if (subtract_one)
1772 p = le64toh(array->entry_array.items[i-1]);
1773 else
1774 p = le64toh(array->entry_array.items[i]);
1775
1776 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1777 if (r < 0)
1778 return r;
1779
1780 if (ret)
1781 *ret = o;
1782
1783 if (offset)
1784 *offset = p;
1785
1786 if (idx)
cbdca852 1787 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1788
1789 return 1;
cec736d2
LP
1790}
1791
f268980d
LP
1792static int generic_array_bisect_plus_one(
1793 JournalFile *f,
1794 uint64_t extra,
1795 uint64_t first,
1796 uint64_t n,
1797 uint64_t needle,
1798 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1799 direction_t direction,
1800 Object **ret,
1801 uint64_t *offset,
1802 uint64_t *idx) {
de190aef 1803
cec736d2 1804 int r;
cbdca852
LP
1805 bool step_back = false;
1806 Object *o;
cec736d2
LP
1807
1808 assert(f);
de190aef 1809 assert(test_object);
cec736d2 1810
de190aef
LP
1811 if (n <= 0)
1812 return 0;
cec736d2 1813
de190aef
LP
1814 /* This bisects the array in object 'first', but first checks
1815 * an extra */
de190aef
LP
1816 r = test_object(f, extra, needle);
1817 if (r < 0)
1818 return r;
a536e261
LP
1819
1820 if (r == TEST_FOUND)
1821 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1822
cbdca852
LP
1823 /* if we are looking with DIRECTION_UP then we need to first
1824 see if in the actual array there is a matching entry, and
1825 return the last one of that. But if there isn't any we need
1826 to return this one. Hence remember this, and return it
1827 below. */
1828 if (r == TEST_LEFT)
1829 step_back = direction == DIRECTION_UP;
de190aef 1830
cbdca852
LP
1831 if (r == TEST_RIGHT) {
1832 if (direction == DIRECTION_DOWN)
1833 goto found;
1834 else
1835 return 0;
a536e261 1836 }
cec736d2 1837
de190aef
LP
1838 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1839
cbdca852
LP
1840 if (r == 0 && step_back)
1841 goto found;
1842
ecf68b1d 1843 if (r > 0 && idx)
de190aef
LP
1844 (*idx) ++;
1845
1846 return r;
cbdca852
LP
1847
1848found:
1849 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1850 if (r < 0)
1851 return r;
1852
1853 if (ret)
1854 *ret = o;
1855
1856 if (offset)
1857 *offset = extra;
1858
1859 if (idx)
1860 *idx = 0;
1861
1862 return 1;
1863}
1864
44a6b1b6 1865_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1866 assert(f);
1867 assert(p > 0);
1868
1869 if (p == needle)
1870 return TEST_FOUND;
1871 else if (p < needle)
1872 return TEST_LEFT;
1873 else
1874 return TEST_RIGHT;
1875}
1876
de190aef
LP
1877static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1878 Object *o;
1879 int r;
1880
1881 assert(f);
1882 assert(p > 0);
1883
1884 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1885 if (r < 0)
1886 return r;
1887
de190aef
LP
1888 if (le64toh(o->entry.seqnum) == needle)
1889 return TEST_FOUND;
1890 else if (le64toh(o->entry.seqnum) < needle)
1891 return TEST_LEFT;
1892 else
1893 return TEST_RIGHT;
1894}
cec736d2 1895
de190aef
LP
1896int journal_file_move_to_entry_by_seqnum(
1897 JournalFile *f,
1898 uint64_t seqnum,
1899 direction_t direction,
1900 Object **ret,
1901 uint64_t *offset) {
1902
1903 return generic_array_bisect(f,
1904 le64toh(f->header->entry_array_offset),
1905 le64toh(f->header->n_entries),
1906 seqnum,
1907 test_object_seqnum,
1908 direction,
1909 ret, offset, NULL);
1910}
cec736d2 1911
de190aef
LP
1912static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1913 Object *o;
1914 int r;
1915
1916 assert(f);
1917 assert(p > 0);
1918
1919 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1920 if (r < 0)
1921 return r;
1922
1923 if (le64toh(o->entry.realtime) == needle)
1924 return TEST_FOUND;
1925 else if (le64toh(o->entry.realtime) < needle)
1926 return TEST_LEFT;
1927 else
1928 return TEST_RIGHT;
cec736d2
LP
1929}
1930
de190aef
LP
1931int journal_file_move_to_entry_by_realtime(
1932 JournalFile *f,
1933 uint64_t realtime,
1934 direction_t direction,
1935 Object **ret,
1936 uint64_t *offset) {
1937
1938 return generic_array_bisect(f,
1939 le64toh(f->header->entry_array_offset),
1940 le64toh(f->header->n_entries),
1941 realtime,
1942 test_object_realtime,
1943 direction,
1944 ret, offset, NULL);
1945}
1946
1947static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1948 Object *o;
1949 int r;
1950
1951 assert(f);
1952 assert(p > 0);
1953
1954 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1955 if (r < 0)
1956 return r;
1957
1958 if (le64toh(o->entry.monotonic) == needle)
1959 return TEST_FOUND;
1960 else if (le64toh(o->entry.monotonic) < needle)
1961 return TEST_LEFT;
1962 else
1963 return TEST_RIGHT;
1964}
1965
2a560338 1966static int find_data_object_by_boot_id(
47838ab3
ZJS
1967 JournalFile *f,
1968 sd_id128_t boot_id,
1969 Object **o,
1970 uint64_t *b) {
2a560338 1971
47838ab3
ZJS
1972 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1973
1974 sd_id128_to_string(boot_id, t + 9);
1975 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1976}
1977
de190aef
LP
1978int journal_file_move_to_entry_by_monotonic(
1979 JournalFile *f,
1980 sd_id128_t boot_id,
1981 uint64_t monotonic,
1982 direction_t direction,
1983 Object **ret,
1984 uint64_t *offset) {
1985
de190aef
LP
1986 Object *o;
1987 int r;
1988
cbdca852 1989 assert(f);
de190aef 1990
47838ab3 1991 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
1992 if (r < 0)
1993 return r;
cbdca852 1994 if (r == 0)
de190aef
LP
1995 return -ENOENT;
1996
1997 return generic_array_bisect_plus_one(f,
1998 le64toh(o->data.entry_offset),
1999 le64toh(o->data.entry_array_offset),
2000 le64toh(o->data.n_entries),
2001 monotonic,
2002 test_object_monotonic,
2003 direction,
2004 ret, offset, NULL);
2005}
2006
1fc605b0 2007void journal_file_reset_location(JournalFile *f) {
6573ef05 2008 f->location_type = LOCATION_HEAD;
1fc605b0 2009 f->current_offset = 0;
6573ef05
MS
2010 f->current_seqnum = 0;
2011 f->current_realtime = 0;
2012 f->current_monotonic = 0;
2013 zero(f->current_boot_id);
2014 f->current_xor_hash = 0;
2015}
2016
950c07d4 2017void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2018 f->location_type = LOCATION_SEEK;
2019 f->current_offset = offset;
2020 f->current_seqnum = le64toh(o->entry.seqnum);
2021 f->current_realtime = le64toh(o->entry.realtime);
2022 f->current_monotonic = le64toh(o->entry.monotonic);
2023 f->current_boot_id = o->entry.boot_id;
2024 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2025}
2026
d8ae66d7
MS
2027int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2028 assert(af);
2029 assert(bf);
2030 assert(af->location_type == LOCATION_SEEK);
2031 assert(bf->location_type == LOCATION_SEEK);
2032
2033 /* If contents and timestamps match, these entries are
2034 * identical, even if the seqnum does not match */
2035 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2036 af->current_monotonic == bf->current_monotonic &&
2037 af->current_realtime == bf->current_realtime &&
2038 af->current_xor_hash == bf->current_xor_hash)
2039 return 0;
2040
2041 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2042
2043 /* If this is from the same seqnum source, compare
2044 * seqnums */
2045 if (af->current_seqnum < bf->current_seqnum)
2046 return -1;
2047 if (af->current_seqnum > bf->current_seqnum)
2048 return 1;
2049
2050 /* Wow! This is weird, different data but the same
2051 * seqnums? Something is borked, but let's make the
2052 * best of it and compare by time. */
2053 }
2054
2055 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2056
2057 /* If the boot id matches, compare monotonic time */
2058 if (af->current_monotonic < bf->current_monotonic)
2059 return -1;
2060 if (af->current_monotonic > bf->current_monotonic)
2061 return 1;
2062 }
2063
2064 /* Otherwise, compare UTC time */
2065 if (af->current_realtime < bf->current_realtime)
2066 return -1;
2067 if (af->current_realtime > bf->current_realtime)
2068 return 1;
2069
2070 /* Finally, compare by contents */
2071 if (af->current_xor_hash < bf->current_xor_hash)
2072 return -1;
2073 if (af->current_xor_hash > bf->current_xor_hash)
2074 return 1;
2075
2076 return 0;
2077}
2078
de190aef
LP
2079int journal_file_next_entry(
2080 JournalFile *f,
f534928a 2081 uint64_t p,
de190aef
LP
2082 direction_t direction,
2083 Object **ret, uint64_t *offset) {
2084
fb099c8d 2085 uint64_t i, n, ofs;
cec736d2
LP
2086 int r;
2087
2088 assert(f);
de190aef
LP
2089
2090 n = le64toh(f->header->n_entries);
2091 if (n <= 0)
2092 return 0;
cec736d2 2093
f534928a 2094 if (p == 0)
de190aef 2095 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2096 else {
de190aef
LP
2097 r = generic_array_bisect(f,
2098 le64toh(f->header->entry_array_offset),
2099 le64toh(f->header->n_entries),
2100 p,
2101 test_object_offset,
2102 DIRECTION_DOWN,
2103 NULL, NULL,
2104 &i);
2105 if (r <= 0)
2106 return r;
2107
2108 if (direction == DIRECTION_DOWN) {
2109 if (i >= n - 1)
2110 return 0;
2111
2112 i++;
2113 } else {
2114 if (i <= 0)
2115 return 0;
2116
2117 i--;
2118 }
cec736d2
LP
2119 }
2120
de190aef 2121 /* And jump to it */
fb099c8d
ZJS
2122 r = generic_array_get(f,
2123 le64toh(f->header->entry_array_offset),
2124 i,
2125 ret, &ofs);
2126 if (r <= 0)
2127 return r;
2128
2129 if (p > 0 &&
2130 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2131 log_debug("%s: entry array corrupted at entry %"PRIu64,
2132 f->path, i);
2133 return -EBADMSG;
2134 }
2135
2136 if (offset)
2137 *offset = ofs;
2138
2139 return 1;
de190aef 2140}
cec736d2 2141
de190aef
LP
2142int journal_file_next_entry_for_data(
2143 JournalFile *f,
2144 Object *o, uint64_t p,
2145 uint64_t data_offset,
2146 direction_t direction,
2147 Object **ret, uint64_t *offset) {
2148
2149 uint64_t n, i;
cec736d2 2150 int r;
de190aef 2151 Object *d;
cec736d2
LP
2152
2153 assert(f);
de190aef 2154 assert(p > 0 || !o);
cec736d2 2155
de190aef 2156 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2157 if (r < 0)
de190aef 2158 return r;
cec736d2 2159
de190aef
LP
2160 n = le64toh(d->data.n_entries);
2161 if (n <= 0)
2162 return n;
cec736d2 2163
de190aef
LP
2164 if (!o)
2165 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2166 else {
2167 if (o->object.type != OBJECT_ENTRY)
2168 return -EINVAL;
cec736d2 2169
de190aef
LP
2170 r = generic_array_bisect_plus_one(f,
2171 le64toh(d->data.entry_offset),
2172 le64toh(d->data.entry_array_offset),
2173 le64toh(d->data.n_entries),
2174 p,
2175 test_object_offset,
2176 DIRECTION_DOWN,
2177 NULL, NULL,
2178 &i);
2179
2180 if (r <= 0)
cec736d2
LP
2181 return r;
2182
de190aef
LP
2183 if (direction == DIRECTION_DOWN) {
2184 if (i >= n - 1)
2185 return 0;
cec736d2 2186
de190aef
LP
2187 i++;
2188 } else {
2189 if (i <= 0)
2190 return 0;
cec736d2 2191
de190aef
LP
2192 i--;
2193 }
cec736d2 2194
de190aef 2195 }
cec736d2 2196
de190aef
LP
2197 return generic_array_get_plus_one(f,
2198 le64toh(d->data.entry_offset),
2199 le64toh(d->data.entry_array_offset),
2200 i,
2201 ret, offset);
2202}
cec736d2 2203
cbdca852
LP
2204int journal_file_move_to_entry_by_offset_for_data(
2205 JournalFile *f,
2206 uint64_t data_offset,
2207 uint64_t p,
2208 direction_t direction,
2209 Object **ret, uint64_t *offset) {
2210
2211 int r;
2212 Object *d;
2213
2214 assert(f);
2215
2216 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2217 if (r < 0)
2218 return r;
2219
2220 return generic_array_bisect_plus_one(f,
2221 le64toh(d->data.entry_offset),
2222 le64toh(d->data.entry_array_offset),
2223 le64toh(d->data.n_entries),
2224 p,
2225 test_object_offset,
2226 direction,
2227 ret, offset, NULL);
2228}
2229
2230int journal_file_move_to_entry_by_monotonic_for_data(
2231 JournalFile *f,
2232 uint64_t data_offset,
2233 sd_id128_t boot_id,
2234 uint64_t monotonic,
2235 direction_t direction,
2236 Object **ret, uint64_t *offset) {
2237
cbdca852
LP
2238 Object *o, *d;
2239 int r;
2240 uint64_t b, z;
2241
2242 assert(f);
2243
2244 /* First, seek by time */
47838ab3 2245 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2246 if (r < 0)
2247 return r;
2248 if (r == 0)
2249 return -ENOENT;
2250
2251 r = generic_array_bisect_plus_one(f,
2252 le64toh(o->data.entry_offset),
2253 le64toh(o->data.entry_array_offset),
2254 le64toh(o->data.n_entries),
2255 monotonic,
2256 test_object_monotonic,
2257 direction,
2258 NULL, &z, NULL);
2259 if (r <= 0)
2260 return r;
2261
2262 /* And now, continue seeking until we find an entry that
2263 * exists in both bisection arrays */
2264
2265 for (;;) {
2266 Object *qo;
2267 uint64_t p, q;
2268
2269 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2270 if (r < 0)
2271 return r;
2272
2273 r = generic_array_bisect_plus_one(f,
2274 le64toh(d->data.entry_offset),
2275 le64toh(d->data.entry_array_offset),
2276 le64toh(d->data.n_entries),
2277 z,
2278 test_object_offset,
2279 direction,
2280 NULL, &p, NULL);
2281 if (r <= 0)
2282 return r;
2283
2284 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2285 if (r < 0)
2286 return r;
2287
2288 r = generic_array_bisect_plus_one(f,
2289 le64toh(o->data.entry_offset),
2290 le64toh(o->data.entry_array_offset),
2291 le64toh(o->data.n_entries),
2292 p,
2293 test_object_offset,
2294 direction,
2295 &qo, &q, NULL);
2296
2297 if (r <= 0)
2298 return r;
2299
2300 if (p == q) {
2301 if (ret)
2302 *ret = qo;
2303 if (offset)
2304 *offset = q;
2305
2306 return 1;
2307 }
2308
2309 z = q;
2310 }
cbdca852
LP
2311}
2312
de190aef
LP
2313int journal_file_move_to_entry_by_seqnum_for_data(
2314 JournalFile *f,
2315 uint64_t data_offset,
2316 uint64_t seqnum,
2317 direction_t direction,
2318 Object **ret, uint64_t *offset) {
cec736d2 2319
de190aef
LP
2320 Object *d;
2321 int r;
cec736d2 2322
91a31dde
LP
2323 assert(f);
2324
de190aef 2325 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2326 if (r < 0)
de190aef 2327 return r;
cec736d2 2328
de190aef
LP
2329 return generic_array_bisect_plus_one(f,
2330 le64toh(d->data.entry_offset),
2331 le64toh(d->data.entry_array_offset),
2332 le64toh(d->data.n_entries),
2333 seqnum,
2334 test_object_seqnum,
2335 direction,
2336 ret, offset, NULL);
2337}
cec736d2 2338
de190aef
LP
2339int journal_file_move_to_entry_by_realtime_for_data(
2340 JournalFile *f,
2341 uint64_t data_offset,
2342 uint64_t realtime,
2343 direction_t direction,
2344 Object **ret, uint64_t *offset) {
2345
2346 Object *d;
2347 int r;
2348
91a31dde
LP
2349 assert(f);
2350
de190aef 2351 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2352 if (r < 0)
de190aef
LP
2353 return r;
2354
2355 return generic_array_bisect_plus_one(f,
2356 le64toh(d->data.entry_offset),
2357 le64toh(d->data.entry_array_offset),
2358 le64toh(d->data.n_entries),
2359 realtime,
2360 test_object_realtime,
2361 direction,
2362 ret, offset, NULL);
cec736d2
LP
2363}
2364
0284adc6 2365void journal_file_dump(JournalFile *f) {
7560fffc 2366 Object *o;
7560fffc 2367 int r;
0284adc6 2368 uint64_t p;
7560fffc
LP
2369
2370 assert(f);
2371
0284adc6 2372 journal_file_print_header(f);
7560fffc 2373
0284adc6
LP
2374 p = le64toh(f->header->header_size);
2375 while (p != 0) {
d05089d8 2376 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
2377 if (r < 0)
2378 goto fail;
7560fffc 2379
0284adc6 2380 switch (o->object.type) {
d98cc1f2 2381
0284adc6
LP
2382 case OBJECT_UNUSED:
2383 printf("Type: OBJECT_UNUSED\n");
2384 break;
d98cc1f2 2385
0284adc6
LP
2386 case OBJECT_DATA:
2387 printf("Type: OBJECT_DATA\n");
2388 break;
7560fffc 2389
3c1668da
LP
2390 case OBJECT_FIELD:
2391 printf("Type: OBJECT_FIELD\n");
2392 break;
2393
0284adc6 2394 case OBJECT_ENTRY:
507f22bd
ZJS
2395 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2396 le64toh(o->entry.seqnum),
2397 le64toh(o->entry.monotonic),
2398 le64toh(o->entry.realtime));
0284adc6 2399 break;
7560fffc 2400
0284adc6
LP
2401 case OBJECT_FIELD_HASH_TABLE:
2402 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2403 break;
7560fffc 2404
0284adc6
LP
2405 case OBJECT_DATA_HASH_TABLE:
2406 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2407 break;
7560fffc 2408
0284adc6
LP
2409 case OBJECT_ENTRY_ARRAY:
2410 printf("Type: OBJECT_ENTRY_ARRAY\n");
2411 break;
7560fffc 2412
0284adc6 2413 case OBJECT_TAG:
507f22bd
ZJS
2414 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2415 le64toh(o->tag.seqnum),
2416 le64toh(o->tag.epoch));
0284adc6 2417 break;
3c1668da
LP
2418
2419 default:
8facc349 2420 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 2421 break;
0284adc6 2422 }
7560fffc 2423
d89c8fdf
ZJS
2424 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2425 printf("Flags: %s\n",
2426 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 2427
0284adc6
LP
2428 if (p == le64toh(f->header->tail_object_offset))
2429 p = 0;
2430 else
2431 p = p + ALIGN64(le64toh(o->object.size));
2432 }
7560fffc 2433
0284adc6
LP
2434 return;
2435fail:
2436 log_error("File corrupt");
7560fffc
LP
2437}
2438
718fe4b1
ZJS
2439static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2440 const char *x;
2441
2442 x = format_timestamp(buf, l, t);
2443 if (x)
2444 return x;
2445 return " --- ";
2446}
2447
0284adc6 2448void journal_file_print_header(JournalFile *f) {
2765b7bb 2449 char a[33], b[33], c[33], d[33];
ed375beb 2450 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2451 struct stat st;
2452 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2453
2454 assert(f);
7560fffc 2455
0284adc6
LP
2456 printf("File Path: %s\n"
2457 "File ID: %s\n"
2458 "Machine ID: %s\n"
2459 "Boot ID: %s\n"
2460 "Sequential Number ID: %s\n"
2461 "State: %s\n"
2462 "Compatible Flags:%s%s\n"
d89c8fdf 2463 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
2464 "Header size: %"PRIu64"\n"
2465 "Arena size: %"PRIu64"\n"
2466 "Data Hash Table Size: %"PRIu64"\n"
2467 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2468 "Rotate Suggested: %s\n"
507f22bd
ZJS
2469 "Head Sequential Number: %"PRIu64"\n"
2470 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2471 "Head Realtime Timestamp: %s\n"
3223f44f 2472 "Tail Realtime Timestamp: %s\n"
ed375beb 2473 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2474 "Objects: %"PRIu64"\n"
2475 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2476 f->path,
2477 sd_id128_to_string(f->header->file_id, a),
2478 sd_id128_to_string(f->header->machine_id, b),
2479 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2480 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2481 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2482 f->header->state == STATE_ONLINE ? "ONLINE" :
2483 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 2484 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
2485 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2486 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2487 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2488 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
2489 le64toh(f->header->header_size),
2490 le64toh(f->header->arena_size),
2491 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2492 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2493 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2494 le64toh(f->header->head_entry_seqnum),
2495 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2496 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2497 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2498 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2499 le64toh(f->header->n_objects),
2500 le64toh(f->header->n_entries));
7560fffc 2501
0284adc6 2502 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2503 printf("Data Objects: %"PRIu64"\n"
0284adc6 2504 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2505 le64toh(f->header->n_data),
0284adc6 2506 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2507
0284adc6 2508 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2509 printf("Field Objects: %"PRIu64"\n"
0284adc6 2510 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2511 le64toh(f->header->n_fields),
0284adc6 2512 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2513
2514 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2515 printf("Tag Objects: %"PRIu64"\n",
2516 le64toh(f->header->n_tags));
3223f44f 2517 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2518 printf("Entry Array Objects: %"PRIu64"\n",
2519 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2520
2521 if (fstat(f->fd, &st) >= 0)
2522 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
7560fffc
LP
2523}
2524
0284adc6
LP
2525int journal_file_open(
2526 const char *fname,
2527 int flags,
2528 mode_t mode,
2529 bool compress,
baed47c3 2530 bool seal,
0284adc6
LP
2531 JournalMetrics *metrics,
2532 MMapCache *mmap_cache,
2533 JournalFile *template,
2534 JournalFile **ret) {
7560fffc 2535
fa6ac760 2536 bool newly_created = false;
0284adc6 2537 JournalFile *f;
fa6ac760 2538 void *h;
0284adc6 2539 int r;
7560fffc 2540
0284adc6 2541 assert(fname);
0559d3a5 2542 assert(ret);
7560fffc 2543
0284adc6
LP
2544 if ((flags & O_ACCMODE) != O_RDONLY &&
2545 (flags & O_ACCMODE) != O_RDWR)
2546 return -EINVAL;
7560fffc 2547
a0108012
LP
2548 if (!endswith(fname, ".journal") &&
2549 !endswith(fname, ".journal~"))
0284adc6 2550 return -EINVAL;
7560fffc 2551
0284adc6
LP
2552 f = new0(JournalFile, 1);
2553 if (!f)
2554 return -ENOMEM;
7560fffc 2555
0284adc6
LP
2556 f->fd = -1;
2557 f->mode = mode;
7560fffc 2558
0284adc6
LP
2559 f->flags = flags;
2560 f->prot = prot_from_flags(flags);
2561 f->writable = (flags & O_ACCMODE) != O_RDONLY;
92261977 2562#if defined(HAVE_LZ4)
d89c8fdf
ZJS
2563 f->compress_lz4 = compress;
2564#elif defined(HAVE_XZ)
2565 f->compress_xz = compress;
48b61739 2566#endif
49a32d43 2567#ifdef HAVE_GCRYPT
baed47c3 2568 f->seal = seal;
49a32d43 2569#endif
7560fffc 2570
0284adc6
LP
2571 if (mmap_cache)
2572 f->mmap = mmap_cache_ref(mmap_cache);
2573 else {
84168d80 2574 f->mmap = mmap_cache_new();
0284adc6
LP
2575 if (!f->mmap) {
2576 r = -ENOMEM;
2577 goto fail;
2578 }
2579 }
7560fffc 2580
0284adc6
LP
2581 f->path = strdup(fname);
2582 if (!f->path) {
2583 r = -ENOMEM;
2584 goto fail;
2585 }
7560fffc 2586
4743015d 2587 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
2588 if (!f->chain_cache) {
2589 r = -ENOMEM;
2590 goto fail;
2591 }
2592
0284adc6
LP
2593 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2594 if (f->fd < 0) {
2595 r = -errno;
2596 goto fail;
7560fffc 2597 }
7560fffc 2598
2678031a
LP
2599 r = journal_file_fstat(f);
2600 if (r < 0)
0284adc6 2601 goto fail;
7560fffc 2602
0284adc6 2603 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a
LP
2604
2605 /* Before we write anything, turn off COW logic. Given
2606 * our write pattern that is quite unfriendly to COW
2607 * file systems this should greatly improve
2608 * performance on COW file systems, such as btrfs, at
2609 * the expense of data integrity features (which
2610 * shouldn't be too bad, given that we do our own
2611 * checksumming). */
1ed8f8c1 2612 r = chattr_fd(f->fd, FS_NOCOW_FL, FS_NOCOW_FL);
65eae3b7
CR
2613 if (r < 0 && r != -ENOTTY)
2614 log_warning_errno(r, "Failed to set file attributes: %m");
11689d2a 2615
fb0951b0
LP
2616 /* Let's attach the creation time to the journal file,
2617 * so that the vacuuming code knows the age of this
2618 * file even if the file might end up corrupted one
2619 * day... Ideally we'd just use the creation time many
2620 * file systems maintain for each file, but there is
2621 * currently no usable API to query this, hence let's
2622 * emulate this via extended attributes. If extended
2623 * attributes are not supported we'll just skip this,
7517e174 2624 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0 2625
d61b600d 2626 fd_setcrtime(f->fd, 0);
7560fffc 2627
feb12d3e 2628#ifdef HAVE_GCRYPT
0284adc6 2629 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2630 * just don't do sealing */
49a32d43
LP
2631 if (f->seal) {
2632 r = journal_file_fss_load(f);
2633 if (r < 0)
2634 f->seal = false;
2635 }
feb12d3e 2636#endif
7560fffc 2637
0284adc6
LP
2638 r = journal_file_init_header(f, template);
2639 if (r < 0)
2640 goto fail;
7560fffc 2641
2678031a
LP
2642 r = journal_file_fstat(f);
2643 if (r < 0)
0284adc6 2644 goto fail;
fb0951b0
LP
2645
2646 newly_created = true;
0284adc6 2647 }
7560fffc 2648
0284adc6
LP
2649 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2650 r = -EIO;
2651 goto fail;
2652 }
7560fffc 2653
fa6ac760 2654 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
977eaa1e 2655 if (r < 0)
0284adc6 2656 goto fail;
7560fffc 2657
fa6ac760
LP
2658 f->header = h;
2659
0284adc6
LP
2660 if (!newly_created) {
2661 r = journal_file_verify_header(f);
2662 if (r < 0)
2663 goto fail;
2664 }
7560fffc 2665
feb12d3e 2666#ifdef HAVE_GCRYPT
0284adc6 2667 if (!newly_created && f->writable) {
baed47c3 2668 r = journal_file_fss_load(f);
0284adc6
LP
2669 if (r < 0)
2670 goto fail;
2671 }
feb12d3e 2672#endif
cec736d2
LP
2673
2674 if (f->writable) {
4a92baf3
LP
2675 if (metrics) {
2676 journal_default_metrics(metrics, f->fd);
2677 f->metrics = *metrics;
2678 } else if (template)
2679 f->metrics = template->metrics;
2680
cec736d2
LP
2681 r = journal_file_refresh_header(f);
2682 if (r < 0)
2683 goto fail;
2684 }
2685
feb12d3e 2686#ifdef HAVE_GCRYPT
baed47c3 2687 r = journal_file_hmac_setup(f);
14d10188
LP
2688 if (r < 0)
2689 goto fail;
feb12d3e 2690#endif
14d10188 2691
cec736d2 2692 if (newly_created) {
de190aef 2693 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2694 if (r < 0)
2695 goto fail;
2696
de190aef 2697 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2698 if (r < 0)
2699 goto fail;
7560fffc 2700
feb12d3e 2701#ifdef HAVE_GCRYPT
7560fffc
LP
2702 r = journal_file_append_first_tag(f);
2703 if (r < 0)
2704 goto fail;
feb12d3e 2705#endif
cec736d2
LP
2706 }
2707
de190aef 2708 r = journal_file_map_field_hash_table(f);
cec736d2
LP
2709 if (r < 0)
2710 goto fail;
2711
de190aef 2712 r = journal_file_map_data_hash_table(f);
cec736d2
LP
2713 if (r < 0)
2714 goto fail;
2715
fa6ac760
LP
2716 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2717 r = -EIO;
2718 goto fail;
2719 }
2720
0559d3a5 2721 *ret = f;
cec736d2
LP
2722 return 0;
2723
2724fail:
fa6ac760
LP
2725 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2726 r = -EIO;
2727
cec736d2
LP
2728 journal_file_close(f);
2729
2730 return r;
2731}
0ac38b70 2732
baed47c3 2733int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2734 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2735 size_t l;
2736 JournalFile *old_file, *new_file = NULL;
2737 int r;
2738
2739 assert(f);
2740 assert(*f);
2741
2742 old_file = *f;
2743
2744 if (!old_file->writable)
2745 return -EINVAL;
2746
2747 if (!endswith(old_file->path, ".journal"))
2748 return -EINVAL;
2749
2750 l = strlen(old_file->path);
57535f47
ZJS
2751 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2752 (int) l - 8, old_file->path,
2753 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2754 le64toh((*f)->header->head_entry_seqnum),
2755 le64toh((*f)->header->head_entry_realtime));
2756 if (r < 0)
0ac38b70
LP
2757 return -ENOMEM;
2758
2678031a
LP
2759 /* Try to rename the file to the archived version. If the file
2760 * already was deleted, we'll get ENOENT, let's ignore that
2761 * case. */
0ac38b70 2762 r = rename(old_file->path, p);
2678031a 2763 if (r < 0 && errno != ENOENT)
0ac38b70
LP
2764 return -errno;
2765
ccdbaf91 2766 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2767
f27a3864
LP
2768 /* Currently, btrfs is not very good with out write patterns
2769 * and fragments heavily. Let's defrag our journal files when
2770 * we archive them */
2771 old_file->defrag_on_close = true;
2772
baed47c3 2773 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2774 journal_file_close(old_file);
2775
2776 *f = new_file;
2777 return r;
2778}
2779
9447a7f1
LP
2780int journal_file_open_reliably(
2781 const char *fname,
2782 int flags,
2783 mode_t mode,
7560fffc 2784 bool compress,
baed47c3 2785 bool seal,
4a92baf3 2786 JournalMetrics *metrics,
27370278 2787 MMapCache *mmap_cache,
9447a7f1
LP
2788 JournalFile *template,
2789 JournalFile **ret) {
2790
2791 int r;
2792 size_t l;
ed375beb 2793 _cleanup_free_ char *p = NULL;
9447a7f1 2794
baed47c3 2795 r = journal_file_open(fname, flags, mode, compress, seal,
27370278 2796 metrics, mmap_cache, template, ret);
288359db
ZJS
2797 if (!IN_SET(r,
2798 -EBADMSG, /* corrupted */
2799 -ENODATA, /* truncated */
2800 -EHOSTDOWN, /* other machine */
2801 -EPROTONOSUPPORT, /* incompatible feature */
2802 -EBUSY, /* unclean shutdown */
2803 -ESHUTDOWN, /* already archived */
2804 -EIO, /* IO error, including SIGBUS on mmap */
2805 -EIDRM /* File has been deleted */))
9447a7f1
LP
2806 return r;
2807
2808 if ((flags & O_ACCMODE) == O_RDONLY)
2809 return r;
2810
2811 if (!(flags & O_CREAT))
2812 return r;
2813
7560fffc
LP
2814 if (!endswith(fname, ".journal"))
2815 return r;
2816
5c70eab4
LP
2817 /* The file is corrupted. Rotate it away and try it again (but only once) */
2818
9447a7f1 2819 l = strlen(fname);
d587eca5 2820 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
57535f47 2821 (int) l - 8, fname,
d587eca5 2822 now(CLOCK_REALTIME),
9bf3b535 2823 random_u64()) < 0)
9447a7f1
LP
2824 return -ENOMEM;
2825
2826 r = rename(fname, p);
9447a7f1
LP
2827 if (r < 0)
2828 return -errno;
2829
f27a3864
LP
2830 /* btrfs doesn't cope well with our write pattern and
2831 * fragments heavily. Let's defrag all files we rotate */
11689d2a
LP
2832
2833 (void) chattr_path(p, false, FS_NOCOW_FL);
f27a3864
LP
2834 (void) btrfs_defrag(p);
2835
a1a1898f 2836 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2837
baed47c3 2838 return journal_file_open(fname, flags, mode, compress, seal,
27370278 2839 metrics, mmap_cache, template, ret);
9447a7f1
LP
2840}
2841
cf244689
LP
2842int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2843 uint64_t i, n;
2844 uint64_t q, xor_hash = 0;
2845 int r;
2846 EntryItem *items;
2847 dual_timestamp ts;
2848
2849 assert(from);
2850 assert(to);
2851 assert(o);
2852 assert(p);
2853
2854 if (!to->writable)
2855 return -EPERM;
2856
2857 ts.monotonic = le64toh(o->entry.monotonic);
2858 ts.realtime = le64toh(o->entry.realtime);
2859
cf244689 2860 n = journal_file_entry_n_items(o);
4faa7004
TA
2861 /* alloca() can't take 0, hence let's allocate at least one */
2862 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
2863
2864 for (i = 0; i < n; i++) {
4fd052ae
FC
2865 uint64_t l, h;
2866 le64_t le_hash;
cf244689
LP
2867 size_t t;
2868 void *data;
2869 Object *u;
2870
2871 q = le64toh(o->entry.items[i].object_offset);
2872 le_hash = o->entry.items[i].hash;
2873
2874 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2875 if (r < 0)
2876 return r;
2877
2878 if (le_hash != o->data.hash)
2879 return -EBADMSG;
2880
2881 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2882 t = (size_t) l;
2883
2884 /* We hit the limit on 32bit machines */
2885 if ((uint64_t) t != l)
2886 return -E2BIG;
2887
d89c8fdf 2888 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 2889#if defined(HAVE_XZ) || defined(HAVE_LZ4)
a7f7d1bd 2890 size_t rsize = 0;
cf244689 2891
d89c8fdf
ZJS
2892 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2893 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2894 if (r < 0)
2895 return r;
cf244689
LP
2896
2897 data = from->compress_buffer;
2898 l = rsize;
3b1a55e1
ZJS
2899#else
2900 return -EPROTONOSUPPORT;
2901#endif
cf244689
LP
2902 } else
2903 data = o->data.payload;
2904
2905 r = journal_file_append_data(to, data, l, &u, &h);
2906 if (r < 0)
2907 return r;
2908
2909 xor_hash ^= le64toh(u->data.hash);
2910 items[i].object_offset = htole64(h);
2911 items[i].hash = u->data.hash;
2912
2913 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2914 if (r < 0)
2915 return r;
2916 }
2917
fa6ac760
LP
2918 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2919
2920 if (mmap_cache_got_sigbus(to->mmap, to->fd))
2921 return -EIO;
2922
2923 return r;
cf244689 2924}
babfc091
LP
2925
2926void journal_default_metrics(JournalMetrics *m, int fd) {
2927 uint64_t fs_size = 0;
2928 struct statvfs ss;
a7bc2c2a 2929 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
babfc091
LP
2930
2931 assert(m);
2932 assert(fd >= 0);
2933
2934 if (fstatvfs(fd, &ss) >= 0)
2935 fs_size = ss.f_frsize * ss.f_blocks;
2936
2937 if (m->max_use == (uint64_t) -1) {
2938
2939 if (fs_size > 0) {
2940 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2941
2942 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2943 m->max_use = DEFAULT_MAX_USE_UPPER;
2944
2945 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2946 m->max_use = DEFAULT_MAX_USE_LOWER;
2947 } else
2948 m->max_use = DEFAULT_MAX_USE_LOWER;
2949 } else {
2950 m->max_use = PAGE_ALIGN(m->max_use);
2951
2952 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2953 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2954 }
2955
2956 if (m->max_size == (uint64_t) -1) {
2957 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2958
2959 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2960 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2961 } else
2962 m->max_size = PAGE_ALIGN(m->max_size);
2963
2964 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2965 m->max_size = JOURNAL_FILE_SIZE_MIN;
2966
2967 if (m->max_size*2 > m->max_use)
2968 m->max_use = m->max_size*2;
2969
2970 if (m->min_size == (uint64_t) -1)
2971 m->min_size = JOURNAL_FILE_SIZE_MIN;
2972 else {
2973 m->min_size = PAGE_ALIGN(m->min_size);
2974
2975 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2976 m->min_size = JOURNAL_FILE_SIZE_MIN;
2977
2978 if (m->min_size > m->max_size)
2979 m->max_size = m->min_size;
2980 }
2981
2982 if (m->keep_free == (uint64_t) -1) {
2983
2984 if (fs_size > 0) {
8621b110 2985 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
2986
2987 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2988 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2989
2990 } else
2991 m->keep_free = DEFAULT_KEEP_FREE;
2992 }
2993
2b43f939
LP
2994 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2995 format_bytes(a, sizeof(a), m->max_use),
2996 format_bytes(b, sizeof(b), m->max_size),
2997 format_bytes(c, sizeof(c), m->min_size),
2998 format_bytes(d, sizeof(d), m->keep_free));
babfc091 2999}
08984293
LP
3000
3001int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
3002 assert(f);
3003 assert(from || to);
3004
3005 if (from) {
162566a4
LP
3006 if (f->header->head_entry_realtime == 0)
3007 return -ENOENT;
08984293 3008
162566a4 3009 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
3010 }
3011
3012 if (to) {
162566a4
LP
3013 if (f->header->tail_entry_realtime == 0)
3014 return -ENOENT;
08984293 3015
162566a4 3016 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
3017 }
3018
3019 return 1;
3020}
3021
3022int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
3023 Object *o;
3024 uint64_t p;
3025 int r;
3026
3027 assert(f);
3028 assert(from || to);
3029
47838ab3 3030 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
3031 if (r <= 0)
3032 return r;
3033
3034 if (le64toh(o->data.n_entries) <= 0)
3035 return 0;
3036
3037 if (from) {
3038 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3039 if (r < 0)
3040 return r;
3041
3042 *from = le64toh(o->entry.monotonic);
3043 }
3044
3045 if (to) {
3046 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3047 if (r < 0)
3048 return r;
3049
3050 r = generic_array_get_plus_one(f,
3051 le64toh(o->data.entry_offset),
3052 le64toh(o->data.entry_array_offset),
3053 le64toh(o->data.n_entries)-1,
3054 &o, NULL);
3055 if (r <= 0)
3056 return r;
3057
3058 *to = le64toh(o->entry.monotonic);
3059 }
3060
3061 return 1;
3062}
dca6219e 3063
fb0951b0 3064bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
3065 assert(f);
3066
3067 /* If we gained new header fields we gained new features,
3068 * hence suggest a rotation */
361f9cbc
LP
3069 if (le64toh(f->header->header_size) < sizeof(Header)) {
3070 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3071 return true;
361f9cbc 3072 }
dca6219e
LP
3073
3074 /* Let's check if the hash tables grew over a certain fill
3075 * level (75%, borrowing this value from Java's hash table
3076 * implementation), and if so suggest a rotation. To calculate
3077 * the fill level we need the n_data field, which only exists
3078 * in newer versions. */
3079
3080 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3081 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3082 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3083 f->path,
3084 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3085 le64toh(f->header->n_data),
3086 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3087 (unsigned long long) f->last_stat.st_size,
3088 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3089 return true;
361f9cbc 3090 }
dca6219e
LP
3091
3092 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3093 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3094 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3095 f->path,
3096 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3097 le64toh(f->header->n_fields),
3098 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3099 return true;
361f9cbc 3100 }
dca6219e 3101
0598fd4a
LP
3102 /* Are the data objects properly indexed by field objects? */
3103 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3104 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3105 le64toh(f->header->n_data) > 0 &&
3106 le64toh(f->header->n_fields) == 0)
3107 return true;
3108
fb0951b0
LP
3109 if (max_file_usec > 0) {
3110 usec_t t, h;
3111
3112 h = le64toh(f->header->head_entry_realtime);
3113 t = now(CLOCK_REALTIME);
3114
3115 if (h > 0 && t > h + max_file_usec)
3116 return true;
3117 }
3118
dca6219e
LP
3119 return false;
3120}