]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
Bug #944: Replacement of a free() call by mfree()
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <sys/mman.h>
23#include <errno.h>
24#include <sys/uio.h>
25#include <unistd.h>
26#include <sys/statvfs.h>
27#include <fcntl.h>
28#include <stddef.h>
11689d2a 29#include <linux/fs.h>
fb0951b0 30
f27a3864 31#include "btrfs-util.h"
cec736d2
LP
32#include "journal-def.h"
33#include "journal-file.h"
0284adc6 34#include "journal-authenticate.h"
cec736d2 35#include "lookup3.h"
807e17f0 36#include "compress.h"
3df3e884 37#include "random-util.h"
cec736d2 38
4a92baf3
LP
39#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
40#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 41
be19b7df 42#define COMPRESSION_SIZE_THRESHOLD (512ULL)
807e17f0 43
babfc091 44/* This is the minimum journal file size */
253f59df 45#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
babfc091
LP
46
47/* These are the lower and upper bounds if we deduce the max_use value
48 * from the file system size */
49#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
50#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
51
52/* This is the upper bound if we deduce max_size from max_use */
71100051 53#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
babfc091
LP
54
55/* This is the upper bound if we deduce the keep_free value from the
56 * file system size */
57#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
58
59/* This is the keep_free value when we can't determine the system
60 * size */
61#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
62
dca6219e
LP
63/* n_data was the first entry we added after the initial file format design */
64#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 65
a4bcff5b
LP
66/* How many entries to keep in the entry array chain cache at max */
67#define CHAIN_CACHE_MAX 20
68
a676e665
LP
69/* How much to increase the journal file size at once each time we allocate something new. */
70#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
71
2678031a
LP
72/* Reread fstat() of the file for detecting deletions at least this often */
73#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
74
fa6ac760
LP
75/* The mmap context to use for the header: we pick the one directly above the last defined object type */
76#define CONTEXT_HEADER _OBJECT_TYPE_MAX
77
9588bc32 78static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
79 assert(f);
80
81 if (!f->writable)
82 return -EPERM;
83
84 if (!(f->fd >= 0 && f->header))
85 return -EINVAL;
86
fa6ac760
LP
87 if (mmap_cache_got_sigbus(f->mmap, f->fd))
88 return -EIO;
89
26687bf8
OS
90 switch(f->header->state) {
91 case STATE_ONLINE:
92 return 0;
93
94 case STATE_OFFLINE:
95 f->header->state = STATE_ONLINE;
96 fsync(f->fd);
97 return 0;
98
99 default:
100 return -EINVAL;
101 }
102}
103
104int journal_file_set_offline(JournalFile *f) {
105 assert(f);
106
107 if (!f->writable)
108 return -EPERM;
109
110 if (!(f->fd >= 0 && f->header))
111 return -EINVAL;
112
113 if (f->header->state != STATE_ONLINE)
114 return 0;
115
116 fsync(f->fd);
117
fa6ac760
LP
118 if (mmap_cache_got_sigbus(f->mmap, f->fd))
119 return -EIO;
120
26687bf8
OS
121 f->header->state = STATE_OFFLINE;
122
fa6ac760
LP
123 if (mmap_cache_got_sigbus(f->mmap, f->fd))
124 return -EIO;
125
26687bf8
OS
126 fsync(f->fd);
127
128 return 0;
129}
130
cec736d2 131void journal_file_close(JournalFile *f) {
de190aef 132 assert(f);
cec736d2 133
feb12d3e 134#ifdef HAVE_GCRYPT
b0af6f41 135 /* Write the final tag */
c586dbf1 136 if (f->seal && f->writable)
b0af6f41 137 journal_file_append_tag(f);
feb12d3e 138#endif
b0af6f41 139
26687bf8 140 journal_file_set_offline(f);
cec736d2 141
fa6ac760
LP
142 if (f->mmap && f->fd >= 0)
143 mmap_cache_close_fd(f->mmap, f->fd);
cec736d2 144
11689d2a
LP
145 if (f->fd >= 0 && f->defrag_on_close) {
146
147 /* Be friendly to btrfs: turn COW back on again now,
148 * and defragment the file. We won't write to the file
149 * ever again, hence remove all fragmentation, and
150 * reenable all the good bits COW usually provides
151 * (such as data checksumming). */
152
1ed8f8c1 153 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
11689d2a
LP
154 (void) btrfs_defrag_fd(f->fd);
155 }
f27a3864 156
03e334a1 157 safe_close(f->fd);
cec736d2 158 free(f->path);
807e17f0 159
16e9f408
LP
160 if (f->mmap)
161 mmap_cache_unref(f->mmap);
162
4743015d 163 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 164
d89c8fdf 165#if defined(HAVE_XZ) || defined(HAVE_LZ4)
807e17f0
LP
166 free(f->compress_buffer);
167#endif
168
7560fffc 169#ifdef HAVE_GCRYPT
baed47c3
LP
170 if (f->fss_file)
171 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
b7c9ae91
LP
172 else if (f->fsprg_state)
173 free(f->fsprg_state);
174
175 free(f->fsprg_seed);
7560fffc
LP
176
177 if (f->hmac)
178 gcry_md_close(f->hmac);
179#endif
180
cec736d2
LP
181 free(f);
182}
183
0ac38b70 184static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 185 Header h = {};
cec736d2
LP
186 ssize_t k;
187 int r;
188
189 assert(f);
190
7560fffc 191 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 192 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 193
d89c8fdf
ZJS
194 h.incompatible_flags |= htole32(
195 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
196 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 197
d89c8fdf
ZJS
198 h.compatible_flags = htole32(
199 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 200
cec736d2
LP
201 r = sd_id128_randomize(&h.file_id);
202 if (r < 0)
203 return r;
204
0ac38b70
LP
205 if (template) {
206 h.seqnum_id = template->header->seqnum_id;
beec0085 207 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
208 } else
209 h.seqnum_id = h.file_id;
cec736d2
LP
210
211 k = pwrite(f->fd, &h, sizeof(h), 0);
212 if (k < 0)
213 return -errno;
214
215 if (k != sizeof(h))
216 return -EIO;
217
218 return 0;
219}
220
221static int journal_file_refresh_header(JournalFile *f) {
de190aef 222 sd_id128_t boot_id;
fa6ac760 223 int r;
cec736d2
LP
224
225 assert(f);
226
227 r = sd_id128_get_machine(&f->header->machine_id);
228 if (r < 0)
229 return r;
230
de190aef 231 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
232 if (r < 0)
233 return r;
234
de190aef
LP
235 if (sd_id128_equal(boot_id, f->header->boot_id))
236 f->tail_entry_monotonic_valid = true;
237
238 f->header->boot_id = boot_id;
239
fa6ac760 240 r = journal_file_set_online(f);
b788cc23 241
7560fffc 242 /* Sync the online state to disk */
a676e665 243 fsync(f->fd);
b788cc23 244
fa6ac760 245 return r;
cec736d2
LP
246}
247
248static int journal_file_verify_header(JournalFile *f) {
d89c8fdf
ZJS
249 uint32_t flags;
250
cec736d2
LP
251 assert(f);
252
7560fffc 253 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
254 return -EBADMSG;
255
7560fffc
LP
256 /* In both read and write mode we refuse to open files with
257 * incompatible flags we don't know */
d89c8fdf
ZJS
258 flags = le32toh(f->header->incompatible_flags);
259 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
260 if (flags & ~HEADER_INCOMPATIBLE_ANY)
261 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
262 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
263 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
264 if (flags)
265 log_debug("Journal file %s uses incompatible flags %"PRIx32
266 " disabled at compilation time.", f->path, flags);
cec736d2 267 return -EPROTONOSUPPORT;
d89c8fdf 268 }
cec736d2 269
7560fffc
LP
270 /* When open for writing we refuse to open files with
271 * compatible flags, too */
d89c8fdf
ZJS
272 flags = le32toh(f->header->compatible_flags);
273 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
274 if (flags & ~HEADER_COMPATIBLE_ANY)
275 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
276 f->path, flags & ~HEADER_COMPATIBLE_ANY);
277 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
278 if (flags)
279 log_debug("Journal file %s uses compatible flags %"PRIx32
280 " disabled at compilation time.", f->path, flags);
281 return -EPROTONOSUPPORT;
7560fffc
LP
282 }
283
db11ac1a
LP
284 if (f->header->state >= _STATE_MAX)
285 return -EBADMSG;
286
dca6219e
LP
287 /* The first addition was n_data, so check that we are at least this large */
288 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
289 return -EBADMSG;
290
8088cbd3 291 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
292 return -EBADMSG;
293
db11ac1a
LP
294 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
295 return -ENODATA;
296
297 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
298 return -ENODATA;
299
7762e02b
LP
300 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
301 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
302 !VALID64(le64toh(f->header->tail_object_offset)) ||
303 !VALID64(le64toh(f->header->entry_array_offset)))
304 return -ENODATA;
305
cec736d2 306 if (f->writable) {
ccdbaf91 307 uint8_t state;
cec736d2
LP
308 sd_id128_t machine_id;
309 int r;
310
311 r = sd_id128_get_machine(&machine_id);
312 if (r < 0)
313 return r;
314
315 if (!sd_id128_equal(machine_id, f->header->machine_id))
316 return -EHOSTDOWN;
317
de190aef 318 state = f->header->state;
cec736d2 319
71fa6f00
LP
320 if (state == STATE_ONLINE) {
321 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
322 return -EBUSY;
323 } else if (state == STATE_ARCHIVED)
cec736d2 324 return -ESHUTDOWN;
71fa6f00 325 else if (state != STATE_OFFLINE) {
8facc349 326 log_debug("Journal file %s has unknown state %i.", f->path, state);
71fa6f00
LP
327 return -EBUSY;
328 }
cec736d2
LP
329 }
330
d89c8fdf
ZJS
331 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
332 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 333
f1889c91 334 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 335
cec736d2
LP
336 return 0;
337}
338
2678031a
LP
339static int journal_file_fstat(JournalFile *f) {
340 assert(f);
341 assert(f->fd >= 0);
342
343 if (fstat(f->fd, &f->last_stat) < 0)
344 return -errno;
345
346 f->last_stat_usec = now(CLOCK_MONOTONIC);
347
348 /* Refuse appending to files that are already deleted */
349 if (f->last_stat.st_nlink <= 0)
350 return -EIDRM;
351
352 return 0;
353}
354
cec736d2 355static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 356 uint64_t old_size, new_size;
fec2aa2f 357 int r;
cec736d2
LP
358
359 assert(f);
360
cec736d2 361 /* We assume that this file is not sparse, and we know that
38ac38b2 362 * for sure, since we always call posix_fallocate()
cec736d2
LP
363 * ourselves */
364
fa6ac760
LP
365 if (mmap_cache_got_sigbus(f->mmap, f->fd))
366 return -EIO;
367
cec736d2 368 old_size =
23b0b2b2 369 le64toh(f->header->header_size) +
cec736d2
LP
370 le64toh(f->header->arena_size);
371
bc85bfee 372 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
373 if (new_size < le64toh(f->header->header_size))
374 new_size = le64toh(f->header->header_size);
bc85bfee 375
2678031a
LP
376 if (new_size <= old_size) {
377
378 /* We already pre-allocated enough space, but before
379 * we write to it, let's check with fstat() if the
380 * file got deleted, in order make sure we don't throw
381 * away the data immediately. Don't check fstat() for
382 * all writes though, but only once ever 10s. */
383
384 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
385 return 0;
386
387 return journal_file_fstat(f);
388 }
389
390 /* Allocate more space. */
cec736d2 391
a676e665 392 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 393 return -E2BIG;
cec736d2 394
a676e665 395 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
396 struct statvfs svfs;
397
398 if (fstatvfs(f->fd, &svfs) >= 0) {
399 uint64_t available;
400
401 available = svfs.f_bfree * svfs.f_bsize;
402
bc85bfee
LP
403 if (available >= f->metrics.keep_free)
404 available -= f->metrics.keep_free;
cec736d2
LP
405 else
406 available = 0;
407
408 if (new_size - old_size > available)
409 return -E2BIG;
410 }
411 }
412
eda4b58b
LP
413 /* Increase by larger blocks at once */
414 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
415 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
416 new_size = f->metrics.max_size;
417
bc85bfee
LP
418 /* Note that the glibc fallocate() fallback is very
419 inefficient, hence we try to minimize the allocation area
420 as we can. */
fec2aa2f
GV
421 r = posix_fallocate(f->fd, old_size, new_size - old_size);
422 if (r != 0)
423 return -r;
cec736d2 424
23b0b2b2 425 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 426
2678031a 427 return journal_file_fstat(f);
cec736d2
LP
428}
429
78519831 430static unsigned type_to_context(ObjectType type) {
d3d3208f 431 /* One context for each type, plus one catch-all for the rest */
69adae51 432 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 433 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 434 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
435}
436
7a9dabea 437static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
2678031a
LP
438 int r;
439
cec736d2 440 assert(f);
cec736d2
LP
441 assert(ret);
442
7762e02b
LP
443 if (size <= 0)
444 return -EINVAL;
445
2a59ea54 446 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
447 if (offset + size > (uint64_t) f->last_stat.st_size) {
448 /* Hmm, out of range? Let's refresh the fstat() data
449 * first, before we trust that check. */
450
2678031a
LP
451 r = journal_file_fstat(f);
452 if (r < 0)
453 return r;
454
455 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
456 return -EADDRNOTAVAIL;
457 }
458
7a9dabea 459 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
460}
461
16e9f408
LP
462static uint64_t minimum_header_size(Object *o) {
463
b8e891e6 464 static const uint64_t table[] = {
16e9f408
LP
465 [OBJECT_DATA] = sizeof(DataObject),
466 [OBJECT_FIELD] = sizeof(FieldObject),
467 [OBJECT_ENTRY] = sizeof(EntryObject),
468 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
469 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
470 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
471 [OBJECT_TAG] = sizeof(TagObject),
472 };
473
474 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
475 return sizeof(ObjectHeader);
476
477 return table[o->object.type];
478}
479
78519831 480int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
481 int r;
482 void *t;
483 Object *o;
484 uint64_t s;
485
486 assert(f);
487 assert(ret);
488
db11ac1a
LP
489 /* Objects may only be located at multiple of 64 bit */
490 if (!VALID64(offset))
491 return -EFAULT;
492
7a9dabea 493 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
494 if (r < 0)
495 return r;
496
497 o = (Object*) t;
498 s = le64toh(o->object.size);
499
500 if (s < sizeof(ObjectHeader))
501 return -EBADMSG;
502
16e9f408
LP
503 if (o->object.type <= OBJECT_UNUSED)
504 return -EBADMSG;
505
506 if (s < minimum_header_size(o))
507 return -EBADMSG;
508
d05089d8 509 if (type > OBJECT_UNUSED && o->object.type != type)
cec736d2
LP
510 return -EBADMSG;
511
512 if (s > sizeof(ObjectHeader)) {
7a9dabea 513 r = journal_file_move_to(f, type, false, offset, s, &t);
cec736d2
LP
514 if (r < 0)
515 return r;
516
517 o = (Object*) t;
518 }
519
cec736d2
LP
520 *ret = o;
521 return 0;
522}
523
d98cc1f2 524static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
525 uint64_t r;
526
527 assert(f);
528
beec0085 529 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
530
531 if (seqnum) {
de190aef 532 /* If an external seqnum counter was passed, we update
c2373f84
LP
533 * both the local and the external one, and set it to
534 * the maximum of both */
535
536 if (*seqnum + 1 > r)
537 r = *seqnum + 1;
538
539 *seqnum = r;
540 }
541
beec0085 542 f->header->tail_entry_seqnum = htole64(r);
cec736d2 543
beec0085
LP
544 if (f->header->head_entry_seqnum == 0)
545 f->header->head_entry_seqnum = htole64(r);
de190aef 546
cec736d2
LP
547 return r;
548}
549
78519831 550int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
551 int r;
552 uint64_t p;
553 Object *tail, *o;
554 void *t;
555
556 assert(f);
d05089d8 557 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
558 assert(size >= sizeof(ObjectHeader));
559 assert(offset);
560 assert(ret);
561
26687bf8
OS
562 r = journal_file_set_online(f);
563 if (r < 0)
564 return r;
565
cec736d2 566 p = le64toh(f->header->tail_object_offset);
cec736d2 567 if (p == 0)
23b0b2b2 568 p = le64toh(f->header->header_size);
cec736d2 569 else {
d05089d8 570 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
571 if (r < 0)
572 return r;
573
574 p += ALIGN64(le64toh(tail->object.size));
575 }
576
577 r = journal_file_allocate(f, p, size);
578 if (r < 0)
579 return r;
580
fcde2389 581 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
582 if (r < 0)
583 return r;
584
585 o = (Object*) t;
586
587 zero(o->object);
de190aef 588 o->object.type = type;
cec736d2
LP
589 o->object.size = htole64(size);
590
591 f->header->tail_object_offset = htole64(p);
cec736d2
LP
592 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
593
594 *ret = o;
595 *offset = p;
596
597 return 0;
598}
599
de190aef 600static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
601 uint64_t s, p;
602 Object *o;
603 int r;
604
605 assert(f);
606
dfabe643 607 /* We estimate that we need 1 hash table entry per 768 of
4a92baf3
LP
608 journal file and we want to make sure we never get beyond
609 75% fill level. Calculate the hash table size for the
610 maximum file size based on these metrics. */
611
dfabe643 612 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
613 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
614 s = DEFAULT_DATA_HASH_TABLE_SIZE;
615
507f22bd 616 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 617
de190aef
LP
618 r = journal_file_append_object(f,
619 OBJECT_DATA_HASH_TABLE,
620 offsetof(Object, hash_table.items) + s,
621 &o, &p);
cec736d2
LP
622 if (r < 0)
623 return r;
624
29804cc1 625 memzero(o->hash_table.items, s);
cec736d2 626
de190aef
LP
627 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
628 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
629
630 return 0;
631}
632
de190aef 633static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
634 uint64_t s, p;
635 Object *o;
636 int r;
637
638 assert(f);
639
3c1668da
LP
640 /* We use a fixed size hash table for the fields as this
641 * number should grow very slowly only */
642
de190aef
LP
643 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
644 r = journal_file_append_object(f,
645 OBJECT_FIELD_HASH_TABLE,
646 offsetof(Object, hash_table.items) + s,
647 &o, &p);
cec736d2
LP
648 if (r < 0)
649 return r;
650
29804cc1 651 memzero(o->hash_table.items, s);
cec736d2 652
de190aef
LP
653 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
654 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
655
656 return 0;
657}
658
dade37d4 659int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
660 uint64_t s, p;
661 void *t;
662 int r;
663
664 assert(f);
665
dade37d4
LP
666 if (f->data_hash_table)
667 return 0;
668
de190aef
LP
669 p = le64toh(f->header->data_hash_table_offset);
670 s = le64toh(f->header->data_hash_table_size);
cec736d2 671
de190aef 672 r = journal_file_move_to(f,
16e9f408 673 OBJECT_DATA_HASH_TABLE,
fcde2389 674 true,
de190aef
LP
675 p, s,
676 &t);
cec736d2
LP
677 if (r < 0)
678 return r;
679
de190aef 680 f->data_hash_table = t;
cec736d2
LP
681 return 0;
682}
683
dade37d4 684int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
685 uint64_t s, p;
686 void *t;
687 int r;
688
689 assert(f);
690
dade37d4
LP
691 if (f->field_hash_table)
692 return 0;
693
de190aef
LP
694 p = le64toh(f->header->field_hash_table_offset);
695 s = le64toh(f->header->field_hash_table_size);
cec736d2 696
de190aef 697 r = journal_file_move_to(f,
16e9f408 698 OBJECT_FIELD_HASH_TABLE,
fcde2389 699 true,
de190aef
LP
700 p, s,
701 &t);
cec736d2
LP
702 if (r < 0)
703 return r;
704
de190aef 705 f->field_hash_table = t;
cec736d2
LP
706 return 0;
707}
708
3c1668da
LP
709static int journal_file_link_field(
710 JournalFile *f,
711 Object *o,
712 uint64_t offset,
713 uint64_t hash) {
714
805d1486 715 uint64_t p, h, m;
3c1668da
LP
716 int r;
717
718 assert(f);
719 assert(o);
720 assert(offset > 0);
721
722 if (o->object.type != OBJECT_FIELD)
723 return -EINVAL;
724
805d1486
LP
725 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
726 if (m <= 0)
727 return -EBADMSG;
3c1668da 728
805d1486 729 /* This might alter the window we are looking at */
3c1668da
LP
730 o->field.next_hash_offset = o->field.head_data_offset = 0;
731
805d1486 732 h = hash % m;
3c1668da
LP
733 p = le64toh(f->field_hash_table[h].tail_hash_offset);
734 if (p == 0)
735 f->field_hash_table[h].head_hash_offset = htole64(offset);
736 else {
737 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
738 if (r < 0)
739 return r;
740
741 o->field.next_hash_offset = htole64(offset);
742 }
743
744 f->field_hash_table[h].tail_hash_offset = htole64(offset);
745
746 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
747 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
748
749 return 0;
750}
751
752static int journal_file_link_data(
753 JournalFile *f,
754 Object *o,
755 uint64_t offset,
756 uint64_t hash) {
757
805d1486 758 uint64_t p, h, m;
cec736d2
LP
759 int r;
760
761 assert(f);
762 assert(o);
763 assert(offset > 0);
b588975f
LP
764
765 if (o->object.type != OBJECT_DATA)
766 return -EINVAL;
cec736d2 767
805d1486
LP
768 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
769 if (m <= 0)
770 return -EBADMSG;
48496df6 771
805d1486 772 /* This might alter the window we are looking at */
de190aef
LP
773 o->data.next_hash_offset = o->data.next_field_offset = 0;
774 o->data.entry_offset = o->data.entry_array_offset = 0;
775 o->data.n_entries = 0;
cec736d2 776
805d1486 777 h = hash % m;
8db4213e 778 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 779 if (p == 0)
cec736d2 780 /* Only entry in the hash table is easy */
de190aef 781 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 782 else {
48496df6
LP
783 /* Move back to the previous data object, to patch in
784 * pointer */
cec736d2 785
de190aef 786 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
787 if (r < 0)
788 return r;
789
de190aef 790 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
791 }
792
de190aef 793 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 794
dca6219e
LP
795 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
796 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
797
cec736d2
LP
798 return 0;
799}
800
3c1668da
LP
801int journal_file_find_field_object_with_hash(
802 JournalFile *f,
803 const void *field, uint64_t size, uint64_t hash,
804 Object **ret, uint64_t *offset) {
805
805d1486 806 uint64_t p, osize, h, m;
3c1668da
LP
807 int r;
808
809 assert(f);
810 assert(field && size > 0);
811
dade37d4
LP
812 /* If the field hash table is empty, we can't find anything */
813 if (le64toh(f->header->field_hash_table_size) <= 0)
814 return 0;
815
816 /* Map the field hash table, if it isn't mapped yet. */
817 r = journal_file_map_field_hash_table(f);
818 if (r < 0)
819 return r;
820
3c1668da
LP
821 osize = offsetof(Object, field.payload) + size;
822
805d1486 823 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
805d1486 824 if (m <= 0)
3c1668da
LP
825 return -EBADMSG;
826
805d1486 827 h = hash % m;
3c1668da
LP
828 p = le64toh(f->field_hash_table[h].head_hash_offset);
829
830 while (p > 0) {
831 Object *o;
832
833 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
834 if (r < 0)
835 return r;
836
837 if (le64toh(o->field.hash) == hash &&
838 le64toh(o->object.size) == osize &&
839 memcmp(o->field.payload, field, size) == 0) {
840
841 if (ret)
842 *ret = o;
843 if (offset)
844 *offset = p;
845
846 return 1;
847 }
848
849 p = le64toh(o->field.next_hash_offset);
850 }
851
852 return 0;
853}
854
855int journal_file_find_field_object(
856 JournalFile *f,
857 const void *field, uint64_t size,
858 Object **ret, uint64_t *offset) {
859
860 uint64_t hash;
861
862 assert(f);
863 assert(field && size > 0);
864
865 hash = hash64(field, size);
866
867 return journal_file_find_field_object_with_hash(f,
868 field, size, hash,
869 ret, offset);
870}
871
de190aef
LP
872int journal_file_find_data_object_with_hash(
873 JournalFile *f,
874 const void *data, uint64_t size, uint64_t hash,
875 Object **ret, uint64_t *offset) {
48496df6 876
805d1486 877 uint64_t p, osize, h, m;
cec736d2
LP
878 int r;
879
880 assert(f);
881 assert(data || size == 0);
882
dade37d4
LP
883 /* If there's no data hash table, then there's no entry. */
884 if (le64toh(f->header->data_hash_table_size) <= 0)
885 return 0;
886
887 /* Map the data hash table, if it isn't mapped yet. */
888 r = journal_file_map_data_hash_table(f);
889 if (r < 0)
890 return r;
891
cec736d2
LP
892 osize = offsetof(Object, data.payload) + size;
893
805d1486
LP
894 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
895 if (m <= 0)
bc85bfee
LP
896 return -EBADMSG;
897
805d1486 898 h = hash % m;
de190aef 899 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 900
de190aef
LP
901 while (p > 0) {
902 Object *o;
cec736d2 903
de190aef 904 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
905 if (r < 0)
906 return r;
907
807e17f0 908 if (le64toh(o->data.hash) != hash)
85a131e8 909 goto next;
807e17f0 910
d89c8fdf 911 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 912#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51 913 uint64_t l;
a7f7d1bd 914 size_t rsize = 0;
cec736d2 915
807e17f0
LP
916 l = le64toh(o->object.size);
917 if (l <= offsetof(Object, data.payload))
cec736d2
LP
918 return -EBADMSG;
919
807e17f0
LP
920 l -= offsetof(Object, data.payload);
921
d89c8fdf
ZJS
922 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
923 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
924 if (r < 0)
925 return r;
807e17f0 926
b785c858 927 if (rsize == size &&
807e17f0
LP
928 memcmp(f->compress_buffer, data, size) == 0) {
929
930 if (ret)
931 *ret = o;
932
933 if (offset)
934 *offset = p;
935
936 return 1;
937 }
3b1a55e1
ZJS
938#else
939 return -EPROTONOSUPPORT;
940#endif
807e17f0
LP
941 } else if (le64toh(o->object.size) == osize &&
942 memcmp(o->data.payload, data, size) == 0) {
943
cec736d2
LP
944 if (ret)
945 *ret = o;
946
947 if (offset)
948 *offset = p;
949
de190aef 950 return 1;
cec736d2
LP
951 }
952
85a131e8 953 next:
cec736d2
LP
954 p = le64toh(o->data.next_hash_offset);
955 }
956
de190aef
LP
957 return 0;
958}
959
960int journal_file_find_data_object(
961 JournalFile *f,
962 const void *data, uint64_t size,
963 Object **ret, uint64_t *offset) {
964
965 uint64_t hash;
966
967 assert(f);
968 assert(data || size == 0);
969
970 hash = hash64(data, size);
971
972 return journal_file_find_data_object_with_hash(f,
973 data, size, hash,
974 ret, offset);
975}
976
3c1668da
LP
977static int journal_file_append_field(
978 JournalFile *f,
979 const void *field, uint64_t size,
980 Object **ret, uint64_t *offset) {
981
982 uint64_t hash, p;
983 uint64_t osize;
984 Object *o;
985 int r;
986
987 assert(f);
988 assert(field && size > 0);
989
990 hash = hash64(field, size);
991
992 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
993 if (r < 0)
994 return r;
995 else if (r > 0) {
996
997 if (ret)
998 *ret = o;
999
1000 if (offset)
1001 *offset = p;
1002
1003 return 0;
1004 }
1005
1006 osize = offsetof(Object, field.payload) + size;
1007 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
1008 if (r < 0)
1009 return r;
3c1668da
LP
1010
1011 o->field.hash = htole64(hash);
1012 memcpy(o->field.payload, field, size);
1013
1014 r = journal_file_link_field(f, o, p, hash);
1015 if (r < 0)
1016 return r;
1017
1018 /* The linking might have altered the window, so let's
1019 * refresh our pointer */
1020 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1021 if (r < 0)
1022 return r;
1023
1024#ifdef HAVE_GCRYPT
1025 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1026 if (r < 0)
1027 return r;
1028#endif
1029
1030 if (ret)
1031 *ret = o;
1032
1033 if (offset)
1034 *offset = p;
1035
1036 return 0;
1037}
1038
48496df6
LP
1039static int journal_file_append_data(
1040 JournalFile *f,
1041 const void *data, uint64_t size,
1042 Object **ret, uint64_t *offset) {
1043
de190aef
LP
1044 uint64_t hash, p;
1045 uint64_t osize;
1046 Object *o;
d89c8fdf 1047 int r, compression = 0;
3c1668da 1048 const void *eq;
de190aef
LP
1049
1050 assert(f);
1051 assert(data || size == 0);
1052
1053 hash = hash64(data, size);
1054
1055 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1056 if (r < 0)
1057 return r;
1058 else if (r > 0) {
1059
1060 if (ret)
1061 *ret = o;
1062
1063 if (offset)
1064 *offset = p;
1065
1066 return 0;
1067 }
1068
1069 osize = offsetof(Object, data.payload) + size;
1070 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1071 if (r < 0)
1072 return r;
1073
cec736d2 1074 o->data.hash = htole64(hash);
807e17f0 1075
d89c8fdf
ZJS
1076#if defined(HAVE_XZ) || defined(HAVE_LZ4)
1077 if (f->compress_xz &&
807e17f0 1078 size >= COMPRESSION_SIZE_THRESHOLD) {
a7f7d1bd 1079 size_t rsize = 0;
807e17f0 1080
d89c8fdf 1081 compression = compress_blob(data, size, o->data.payload, &rsize);
807e17f0 1082
d89c8fdf 1083 if (compression) {
807e17f0 1084 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1085 o->object.flags |= compression;
807e17f0 1086
fa1c4b51 1087 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1088 size, rsize, object_compressed_to_string(compression));
807e17f0
LP
1089 }
1090 }
1091#endif
1092
d89c8fdf 1093 if (!compression && size > 0)
807e17f0 1094 memcpy(o->data.payload, data, size);
cec736d2 1095
de190aef 1096 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1097 if (r < 0)
1098 return r;
1099
48496df6
LP
1100 /* The linking might have altered the window, so let's
1101 * refresh our pointer */
1102 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1103 if (r < 0)
1104 return r;
1105
08c6f819
SL
1106 if (!data)
1107 eq = NULL;
1108 else
1109 eq = memchr(data, '=', size);
3c1668da 1110 if (eq && eq > data) {
748db592 1111 Object *fo = NULL;
3c1668da 1112 uint64_t fp;
3c1668da
LP
1113
1114 /* Create field object ... */
1115 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1116 if (r < 0)
1117 return r;
1118
1119 /* ... and link it in. */
1120 o->data.next_field_offset = fo->field.head_data_offset;
1121 fo->field.head_data_offset = le64toh(p);
1122 }
1123
5996c7c2
LP
1124#ifdef HAVE_GCRYPT
1125 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1126 if (r < 0)
1127 return r;
1128#endif
1129
cec736d2
LP
1130 if (ret)
1131 *ret = o;
1132
1133 if (offset)
de190aef 1134 *offset = p;
cec736d2
LP
1135
1136 return 0;
1137}
1138
1139uint64_t journal_file_entry_n_items(Object *o) {
1140 assert(o);
b588975f
LP
1141
1142 if (o->object.type != OBJECT_ENTRY)
1143 return 0;
cec736d2
LP
1144
1145 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1146}
1147
0284adc6 1148uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1149 assert(o);
b588975f
LP
1150
1151 if (o->object.type != OBJECT_ENTRY_ARRAY)
1152 return 0;
de190aef
LP
1153
1154 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1155}
1156
fb9a24b6
LP
1157uint64_t journal_file_hash_table_n_items(Object *o) {
1158 assert(o);
b588975f
LP
1159
1160 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1161 o->object.type != OBJECT_FIELD_HASH_TABLE)
1162 return 0;
fb9a24b6
LP
1163
1164 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1165}
1166
de190aef 1167static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1168 le64_t *first,
1169 le64_t *idx,
de190aef 1170 uint64_t p) {
cec736d2 1171 int r;
de190aef
LP
1172 uint64_t n = 0, ap = 0, q, i, a, hidx;
1173 Object *o;
1174
cec736d2 1175 assert(f);
de190aef
LP
1176 assert(first);
1177 assert(idx);
1178 assert(p > 0);
cec736d2 1179
de190aef
LP
1180 a = le64toh(*first);
1181 i = hidx = le64toh(*idx);
1182 while (a > 0) {
1183
1184 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1185 if (r < 0)
1186 return r;
cec736d2 1187
de190aef
LP
1188 n = journal_file_entry_array_n_items(o);
1189 if (i < n) {
1190 o->entry_array.items[i] = htole64(p);
1191 *idx = htole64(hidx + 1);
1192 return 0;
1193 }
cec736d2 1194
de190aef
LP
1195 i -= n;
1196 ap = a;
1197 a = le64toh(o->entry_array.next_entry_array_offset);
1198 }
1199
1200 if (hidx > n)
1201 n = (hidx+1) * 2;
1202 else
1203 n = n * 2;
1204
1205 if (n < 4)
1206 n = 4;
1207
1208 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1209 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1210 &o, &q);
cec736d2
LP
1211 if (r < 0)
1212 return r;
1213
feb12d3e 1214#ifdef HAVE_GCRYPT
5996c7c2 1215 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1216 if (r < 0)
1217 return r;
feb12d3e 1218#endif
b0af6f41 1219
de190aef 1220 o->entry_array.items[i] = htole64(p);
cec736d2 1221
de190aef 1222 if (ap == 0)
7be3aa17 1223 *first = htole64(q);
cec736d2 1224 else {
de190aef 1225 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1226 if (r < 0)
1227 return r;
1228
de190aef
LP
1229 o->entry_array.next_entry_array_offset = htole64(q);
1230 }
cec736d2 1231
2dee23eb
LP
1232 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1233 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1234
de190aef
LP
1235 *idx = htole64(hidx + 1);
1236
1237 return 0;
1238}
cec736d2 1239
de190aef 1240static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1241 le64_t *extra,
1242 le64_t *first,
1243 le64_t *idx,
de190aef
LP
1244 uint64_t p) {
1245
1246 int r;
1247
1248 assert(f);
1249 assert(extra);
1250 assert(first);
1251 assert(idx);
1252 assert(p > 0);
1253
1254 if (*idx == 0)
1255 *extra = htole64(p);
1256 else {
4fd052ae 1257 le64_t i;
de190aef 1258
7be3aa17 1259 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1260 r = link_entry_into_array(f, first, &i, p);
1261 if (r < 0)
1262 return r;
cec736d2
LP
1263 }
1264
de190aef
LP
1265 *idx = htole64(le64toh(*idx) + 1);
1266 return 0;
1267}
1268
1269static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1270 uint64_t p;
1271 int r;
1272 assert(f);
1273 assert(o);
1274 assert(offset > 0);
1275
1276 p = le64toh(o->entry.items[i].object_offset);
1277 if (p == 0)
1278 return -EINVAL;
1279
1280 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1281 if (r < 0)
1282 return r;
1283
de190aef
LP
1284 return link_entry_into_array_plus_one(f,
1285 &o->data.entry_offset,
1286 &o->data.entry_array_offset,
1287 &o->data.n_entries,
1288 offset);
cec736d2
LP
1289}
1290
1291static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1292 uint64_t n, i;
cec736d2
LP
1293 int r;
1294
1295 assert(f);
1296 assert(o);
1297 assert(offset > 0);
b588975f
LP
1298
1299 if (o->object.type != OBJECT_ENTRY)
1300 return -EINVAL;
cec736d2 1301
b788cc23
LP
1302 __sync_synchronize();
1303
cec736d2 1304 /* Link up the entry itself */
de190aef
LP
1305 r = link_entry_into_array(f,
1306 &f->header->entry_array_offset,
1307 &f->header->n_entries,
1308 offset);
1309 if (r < 0)
1310 return r;
cec736d2 1311
507f22bd 1312 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1313
de190aef 1314 if (f->header->head_entry_realtime == 0)
0ac38b70 1315 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1316
0ac38b70 1317 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1318 f->header->tail_entry_monotonic = o->entry.monotonic;
1319
1320 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1321
1322 /* Link up the items */
1323 n = journal_file_entry_n_items(o);
1324 for (i = 0; i < n; i++) {
1325 r = journal_file_link_entry_item(f, o, offset, i);
1326 if (r < 0)
1327 return r;
1328 }
1329
cec736d2
LP
1330 return 0;
1331}
1332
1333static int journal_file_append_entry_internal(
1334 JournalFile *f,
1335 const dual_timestamp *ts,
1336 uint64_t xor_hash,
1337 const EntryItem items[], unsigned n_items,
de190aef 1338 uint64_t *seqnum,
cec736d2
LP
1339 Object **ret, uint64_t *offset) {
1340 uint64_t np;
1341 uint64_t osize;
1342 Object *o;
1343 int r;
1344
1345 assert(f);
1346 assert(items || n_items == 0);
de190aef 1347 assert(ts);
cec736d2
LP
1348
1349 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1350
de190aef 1351 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1352 if (r < 0)
1353 return r;
1354
d98cc1f2 1355 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1356 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1357 o->entry.realtime = htole64(ts->realtime);
1358 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1359 o->entry.xor_hash = htole64(xor_hash);
1360 o->entry.boot_id = f->header->boot_id;
1361
feb12d3e 1362#ifdef HAVE_GCRYPT
5996c7c2 1363 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1364 if (r < 0)
1365 return r;
feb12d3e 1366#endif
b0af6f41 1367
cec736d2
LP
1368 r = journal_file_link_entry(f, o, np);
1369 if (r < 0)
1370 return r;
1371
1372 if (ret)
1373 *ret = o;
1374
1375 if (offset)
1376 *offset = np;
1377
1378 return 0;
1379}
1380
cf244689 1381void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1382 assert(f);
1383
1384 /* inotify() does not receive IN_MODIFY events from file
1385 * accesses done via mmap(). After each access we hence
1386 * trigger IN_MODIFY by truncating the journal file to its
1387 * current size which triggers IN_MODIFY. */
1388
bc85bfee
LP
1389 __sync_synchronize();
1390
50f20cfd 1391 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
56f64d95 1392 log_error_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1393}
1394
1f2da9ec
LP
1395static int entry_item_cmp(const void *_a, const void *_b) {
1396 const EntryItem *a = _a, *b = _b;
1397
1398 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1399 return -1;
1400 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1401 return 1;
1402 return 0;
1403}
1404
de190aef 1405int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1406 unsigned i;
1407 EntryItem *items;
1408 int r;
1409 uint64_t xor_hash = 0;
de190aef 1410 struct dual_timestamp _ts;
cec736d2
LP
1411
1412 assert(f);
1413 assert(iovec || n_iovec == 0);
1414
de190aef
LP
1415 if (!ts) {
1416 dual_timestamp_get(&_ts);
1417 ts = &_ts;
1418 }
1419
1420 if (f->tail_entry_monotonic_valid &&
1421 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1422 return -EINVAL;
1423
feb12d3e 1424#ifdef HAVE_GCRYPT
7560fffc
LP
1425 r = journal_file_maybe_append_tag(f, ts->realtime);
1426 if (r < 0)
1427 return r;
feb12d3e 1428#endif
7560fffc 1429
64825d3c 1430 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1431 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1432
1433 for (i = 0; i < n_iovec; i++) {
1434 uint64_t p;
1435 Object *o;
1436
1437 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1438 if (r < 0)
cf244689 1439 return r;
cec736d2
LP
1440
1441 xor_hash ^= le64toh(o->data.hash);
1442 items[i].object_offset = htole64(p);
de7b95cd 1443 items[i].hash = o->data.hash;
cec736d2
LP
1444 }
1445
1f2da9ec
LP
1446 /* Order by the position on disk, in order to improve seek
1447 * times for rotating media. */
7ff7394d 1448 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1449
de190aef 1450 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1451
fa6ac760
LP
1452 /* If the memory mapping triggered a SIGBUS then we return an
1453 * IO error and ignore the error code passed down to us, since
1454 * it is very likely just an effect of a nullified replacement
1455 * mapping page */
1456
1457 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1458 r = -EIO;
1459
50f20cfd
LP
1460 journal_file_post_change(f);
1461
cec736d2
LP
1462 return r;
1463}
1464
/* One cached position within a chain of entry array objects, so that
 * repeated lookups in the same chain need not restart from its head. */
typedef struct ChainCacheItem {
        /* Offset of the array at the head of the chain (hashmap key) */
        uint64_t first;

        /* Offset of the array this item caches */
        uint64_t array;

        /* First item stored in the cached array */
        uint64_t begin;

        /* Number of items in all arrays of the chain before the cached one */
        uint64_t total;

        /* Index looked at last, used to improve locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1472
1473static void chain_cache_put(
4743015d 1474 OrderedHashmap *h,
a4bcff5b
LP
1475 ChainCacheItem *ci,
1476 uint64_t first,
1477 uint64_t array,
1478 uint64_t begin,
f268980d
LP
1479 uint64_t total,
1480 uint64_t last_index) {
a4bcff5b
LP
1481
1482 if (!ci) {
34741aa3
LP
1483 /* If the chain item to cache for this chain is the
1484 * first one it's not worth caching anything */
1485 if (array == first)
1486 return;
1487
29433089 1488 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 1489 ci = ordered_hashmap_steal_first(h);
29433089
LP
1490 assert(ci);
1491 } else {
a4bcff5b
LP
1492 ci = new(ChainCacheItem, 1);
1493 if (!ci)
1494 return;
1495 }
1496
1497 ci->first = first;
1498
4743015d 1499 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
1500 free(ci);
1501 return;
1502 }
1503 } else
1504 assert(ci->first == first);
1505
1506 ci->array = array;
1507 ci->begin = begin;
1508 ci->total = total;
f268980d 1509 ci->last_index = last_index;
a4bcff5b
LP
1510}
1511
f268980d
LP
1512static int generic_array_get(
1513 JournalFile *f,
1514 uint64_t first,
1515 uint64_t i,
1516 Object **ret, uint64_t *offset) {
de190aef 1517
cec736d2 1518 Object *o;
a4bcff5b 1519 uint64_t p = 0, a, t = 0;
cec736d2 1520 int r;
a4bcff5b 1521 ChainCacheItem *ci;
cec736d2
LP
1522
1523 assert(f);
1524
de190aef 1525 a = first;
a4bcff5b
LP
1526
1527 /* Try the chain cache first */
4743015d 1528 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1529 if (ci && i > ci->total) {
1530 a = ci->array;
1531 i -= ci->total;
1532 t = ci->total;
1533 }
1534
de190aef 1535 while (a > 0) {
a4bcff5b 1536 uint64_t k;
cec736d2 1537
de190aef
LP
1538 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1539 if (r < 0)
1540 return r;
cec736d2 1541
a4bcff5b
LP
1542 k = journal_file_entry_array_n_items(o);
1543 if (i < k) {
de190aef 1544 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1545 goto found;
cec736d2
LP
1546 }
1547
a4bcff5b
LP
1548 i -= k;
1549 t += k;
de190aef
LP
1550 a = le64toh(o->entry_array.next_entry_array_offset);
1551 }
1552
a4bcff5b
LP
1553 return 0;
1554
1555found:
1556 /* Let's cache this item for the next invocation */
af13a6b0 1557 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1558
1559 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1560 if (r < 0)
1561 return r;
1562
1563 if (ret)
1564 *ret = o;
1565
1566 if (offset)
1567 *offset = p;
1568
1569 return 1;
1570}
1571
f268980d
LP
1572static int generic_array_get_plus_one(
1573 JournalFile *f,
1574 uint64_t extra,
1575 uint64_t first,
1576 uint64_t i,
1577 Object **ret, uint64_t *offset) {
de190aef
LP
1578
1579 Object *o;
1580
1581 assert(f);
1582
1583 if (i == 0) {
1584 int r;
1585
1586 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1587 if (r < 0)
1588 return r;
1589
de190aef
LP
1590 if (ret)
1591 *ret = o;
cec736d2 1592
de190aef
LP
1593 if (offset)
1594 *offset = extra;
cec736d2 1595
de190aef 1596 return 1;
cec736d2
LP
1597 }
1598
de190aef
LP
1599 return generic_array_get(f, first, i-1, ret, offset);
1600}
cec736d2 1601
de190aef
LP
/* Results of the test_object callbacks used when bisecting: the object
 * tested either matches the needle, or lies to the left or to the right
 * of it. */
enum {
        TEST_FOUND = 0,
        TEST_LEFT  = 1,
        TEST_RIGHT = 2
};
cec736d2 1607
f268980d
LP
1608static int generic_array_bisect(
1609 JournalFile *f,
1610 uint64_t first,
1611 uint64_t n,
1612 uint64_t needle,
1613 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1614 direction_t direction,
1615 Object **ret,
1616 uint64_t *offset,
1617 uint64_t *idx) {
1618
1619 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1620 bool subtract_one = false;
1621 Object *o, *array = NULL;
1622 int r;
a4bcff5b 1623 ChainCacheItem *ci;
cec736d2 1624
de190aef
LP
1625 assert(f);
1626 assert(test_object);
cec736d2 1627
a4bcff5b 1628 /* Start with the first array in the chain */
de190aef 1629 a = first;
a4bcff5b 1630
4743015d 1631 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1632 if (ci && n > ci->total) {
1633 /* Ah, we have iterated this bisection array chain
1634 * previously! Let's see if we can skip ahead in the
1635 * chain, as far as the last time. But we can't jump
1636 * backwards in the chain, so let's check that
1637 * first. */
1638
1639 r = test_object(f, ci->begin, needle);
1640 if (r < 0)
1641 return r;
1642
1643 if (r == TEST_LEFT) {
f268980d 1644 /* OK, what we are looking for is right of the
a4bcff5b
LP
1645 * begin of this EntryArray, so let's jump
1646 * straight to previously cached array in the
1647 * chain */
1648
1649 a = ci->array;
1650 n -= ci->total;
1651 t = ci->total;
f268980d 1652 last_index = ci->last_index;
a4bcff5b
LP
1653 }
1654 }
1655
de190aef
LP
1656 while (a > 0) {
1657 uint64_t left, right, k, lp;
1658
1659 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1660 if (r < 0)
1661 return r;
1662
de190aef
LP
1663 k = journal_file_entry_array_n_items(array);
1664 right = MIN(k, n);
1665 if (right <= 0)
1666 return 0;
cec736d2 1667
de190aef
LP
1668 i = right - 1;
1669 lp = p = le64toh(array->entry_array.items[i]);
1670 if (p <= 0)
1671 return -EBADMSG;
cec736d2 1672
de190aef
LP
1673 r = test_object(f, p, needle);
1674 if (r < 0)
1675 return r;
cec736d2 1676
de190aef
LP
1677 if (r == TEST_FOUND)
1678 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1679
1680 if (r == TEST_RIGHT) {
1681 left = 0;
1682 right -= 1;
f268980d
LP
1683
1684 if (last_index != (uint64_t) -1) {
1685 assert(last_index <= right);
1686
1687 /* If we cached the last index we
1688 * looked at, let's try to not to jump
1689 * too wildly around and see if we can
1690 * limit the range to look at early to
1691 * the immediate neighbors of the last
1692 * index we looked at. */
1693
1694 if (last_index > 0) {
1695 uint64_t x = last_index - 1;
1696
1697 p = le64toh(array->entry_array.items[x]);
1698 if (p <= 0)
1699 return -EBADMSG;
1700
1701 r = test_object(f, p, needle);
1702 if (r < 0)
1703 return r;
1704
1705 if (r == TEST_FOUND)
1706 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1707
1708 if (r == TEST_RIGHT)
1709 right = x;
1710 else
1711 left = x + 1;
1712 }
1713
1714 if (last_index < right) {
1715 uint64_t y = last_index + 1;
1716
1717 p = le64toh(array->entry_array.items[y]);
1718 if (p <= 0)
1719 return -EBADMSG;
1720
1721 r = test_object(f, p, needle);
1722 if (r < 0)
1723 return r;
1724
1725 if (r == TEST_FOUND)
1726 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1727
1728 if (r == TEST_RIGHT)
1729 right = y;
1730 else
1731 left = y + 1;
1732 }
f268980d
LP
1733 }
1734
de190aef
LP
1735 for (;;) {
1736 if (left == right) {
1737 if (direction == DIRECTION_UP)
1738 subtract_one = true;
1739
1740 i = left;
1741 goto found;
1742 }
1743
1744 assert(left < right);
de190aef 1745 i = (left + right) / 2;
f268980d 1746
de190aef
LP
1747 p = le64toh(array->entry_array.items[i]);
1748 if (p <= 0)
1749 return -EBADMSG;
1750
1751 r = test_object(f, p, needle);
1752 if (r < 0)
1753 return r;
cec736d2 1754
de190aef
LP
1755 if (r == TEST_FOUND)
1756 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1757
1758 if (r == TEST_RIGHT)
1759 right = i;
1760 else
1761 left = i + 1;
1762 }
1763 }
1764
2173cbf8 1765 if (k >= n) {
cbdca852
LP
1766 if (direction == DIRECTION_UP) {
1767 i = n;
1768 subtract_one = true;
1769 goto found;
1770 }
1771
cec736d2 1772 return 0;
cbdca852 1773 }
cec736d2 1774
de190aef
LP
1775 last_p = lp;
1776
1777 n -= k;
1778 t += k;
f268980d 1779 last_index = (uint64_t) -1;
de190aef 1780 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1781 }
1782
1783 return 0;
de190aef
LP
1784
1785found:
1786 if (subtract_one && t == 0 && i == 0)
1787 return 0;
1788
a4bcff5b 1789 /* Let's cache this item for the next invocation */
af13a6b0 1790 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 1791
de190aef
LP
1792 if (subtract_one && i == 0)
1793 p = last_p;
1794 else if (subtract_one)
1795 p = le64toh(array->entry_array.items[i-1]);
1796 else
1797 p = le64toh(array->entry_array.items[i]);
1798
1799 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1800 if (r < 0)
1801 return r;
1802
1803 if (ret)
1804 *ret = o;
1805
1806 if (offset)
1807 *offset = p;
1808
1809 if (idx)
cbdca852 1810 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1811
1812 return 1;
cec736d2
LP
1813}
1814
f268980d
LP
1815static int generic_array_bisect_plus_one(
1816 JournalFile *f,
1817 uint64_t extra,
1818 uint64_t first,
1819 uint64_t n,
1820 uint64_t needle,
1821 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1822 direction_t direction,
1823 Object **ret,
1824 uint64_t *offset,
1825 uint64_t *idx) {
de190aef 1826
cec736d2 1827 int r;
cbdca852
LP
1828 bool step_back = false;
1829 Object *o;
cec736d2
LP
1830
1831 assert(f);
de190aef 1832 assert(test_object);
cec736d2 1833
de190aef
LP
1834 if (n <= 0)
1835 return 0;
cec736d2 1836
de190aef
LP
1837 /* This bisects the array in object 'first', but first checks
1838 * an extra */
de190aef
LP
1839 r = test_object(f, extra, needle);
1840 if (r < 0)
1841 return r;
a536e261
LP
1842
1843 if (r == TEST_FOUND)
1844 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1845
cbdca852
LP
1846 /* if we are looking with DIRECTION_UP then we need to first
1847 see if in the actual array there is a matching entry, and
1848 return the last one of that. But if there isn't any we need
1849 to return this one. Hence remember this, and return it
1850 below. */
1851 if (r == TEST_LEFT)
1852 step_back = direction == DIRECTION_UP;
de190aef 1853
cbdca852
LP
1854 if (r == TEST_RIGHT) {
1855 if (direction == DIRECTION_DOWN)
1856 goto found;
1857 else
1858 return 0;
a536e261 1859 }
cec736d2 1860
de190aef
LP
1861 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1862
cbdca852
LP
1863 if (r == 0 && step_back)
1864 goto found;
1865
ecf68b1d 1866 if (r > 0 && idx)
de190aef
LP
1867 (*idx) ++;
1868
1869 return r;
cbdca852
LP
1870
1871found:
1872 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1873 if (r < 0)
1874 return r;
1875
1876 if (ret)
1877 *ret = o;
1878
1879 if (offset)
1880 *offset = extra;
1881
1882 if (idx)
1883 *idx = 0;
1884
1885 return 1;
1886}
1887
44a6b1b6 1888_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1889 assert(f);
1890 assert(p > 0);
1891
1892 if (p == needle)
1893 return TEST_FOUND;
1894 else if (p < needle)
1895 return TEST_LEFT;
1896 else
1897 return TEST_RIGHT;
1898}
1899
de190aef
LP
1900static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1901 Object *o;
1902 int r;
1903
1904 assert(f);
1905 assert(p > 0);
1906
1907 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1908 if (r < 0)
1909 return r;
1910
de190aef
LP
1911 if (le64toh(o->entry.seqnum) == needle)
1912 return TEST_FOUND;
1913 else if (le64toh(o->entry.seqnum) < needle)
1914 return TEST_LEFT;
1915 else
1916 return TEST_RIGHT;
1917}
cec736d2 1918
de190aef
LP
1919int journal_file_move_to_entry_by_seqnum(
1920 JournalFile *f,
1921 uint64_t seqnum,
1922 direction_t direction,
1923 Object **ret,
1924 uint64_t *offset) {
1925
1926 return generic_array_bisect(f,
1927 le64toh(f->header->entry_array_offset),
1928 le64toh(f->header->n_entries),
1929 seqnum,
1930 test_object_seqnum,
1931 direction,
1932 ret, offset, NULL);
1933}
cec736d2 1934
de190aef
LP
1935static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1936 Object *o;
1937 int r;
1938
1939 assert(f);
1940 assert(p > 0);
1941
1942 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1943 if (r < 0)
1944 return r;
1945
1946 if (le64toh(o->entry.realtime) == needle)
1947 return TEST_FOUND;
1948 else if (le64toh(o->entry.realtime) < needle)
1949 return TEST_LEFT;
1950 else
1951 return TEST_RIGHT;
cec736d2
LP
1952}
1953
de190aef
LP
1954int journal_file_move_to_entry_by_realtime(
1955 JournalFile *f,
1956 uint64_t realtime,
1957 direction_t direction,
1958 Object **ret,
1959 uint64_t *offset) {
1960
1961 return generic_array_bisect(f,
1962 le64toh(f->header->entry_array_offset),
1963 le64toh(f->header->n_entries),
1964 realtime,
1965 test_object_realtime,
1966 direction,
1967 ret, offset, NULL);
1968}
1969
1970static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1971 Object *o;
1972 int r;
1973
1974 assert(f);
1975 assert(p > 0);
1976
1977 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1978 if (r < 0)
1979 return r;
1980
1981 if (le64toh(o->entry.monotonic) == needle)
1982 return TEST_FOUND;
1983 else if (le64toh(o->entry.monotonic) < needle)
1984 return TEST_LEFT;
1985 else
1986 return TEST_RIGHT;
1987}
1988
2a560338 1989static int find_data_object_by_boot_id(
47838ab3
ZJS
1990 JournalFile *f,
1991 sd_id128_t boot_id,
1992 Object **o,
1993 uint64_t *b) {
2a560338 1994
47838ab3
ZJS
1995 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1996
1997 sd_id128_to_string(boot_id, t + 9);
1998 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1999}
2000
de190aef
LP
2001int journal_file_move_to_entry_by_monotonic(
2002 JournalFile *f,
2003 sd_id128_t boot_id,
2004 uint64_t monotonic,
2005 direction_t direction,
2006 Object **ret,
2007 uint64_t *offset) {
2008
de190aef
LP
2009 Object *o;
2010 int r;
2011
cbdca852 2012 assert(f);
de190aef 2013
47838ab3 2014 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
2015 if (r < 0)
2016 return r;
cbdca852 2017 if (r == 0)
de190aef
LP
2018 return -ENOENT;
2019
2020 return generic_array_bisect_plus_one(f,
2021 le64toh(o->data.entry_offset),
2022 le64toh(o->data.entry_array_offset),
2023 le64toh(o->data.n_entries),
2024 monotonic,
2025 test_object_monotonic,
2026 direction,
2027 ret, offset, NULL);
2028}
2029
1fc605b0 2030void journal_file_reset_location(JournalFile *f) {
6573ef05 2031 f->location_type = LOCATION_HEAD;
1fc605b0 2032 f->current_offset = 0;
6573ef05
MS
2033 f->current_seqnum = 0;
2034 f->current_realtime = 0;
2035 f->current_monotonic = 0;
2036 zero(f->current_boot_id);
2037 f->current_xor_hash = 0;
2038}
2039
950c07d4 2040void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2041 f->location_type = LOCATION_SEEK;
2042 f->current_offset = offset;
2043 f->current_seqnum = le64toh(o->entry.seqnum);
2044 f->current_realtime = le64toh(o->entry.realtime);
2045 f->current_monotonic = le64toh(o->entry.monotonic);
2046 f->current_boot_id = o->entry.boot_id;
2047 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2048}
2049
d8ae66d7
MS
2050int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2051 assert(af);
2052 assert(bf);
2053 assert(af->location_type == LOCATION_SEEK);
2054 assert(bf->location_type == LOCATION_SEEK);
2055
2056 /* If contents and timestamps match, these entries are
2057 * identical, even if the seqnum does not match */
2058 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2059 af->current_monotonic == bf->current_monotonic &&
2060 af->current_realtime == bf->current_realtime &&
2061 af->current_xor_hash == bf->current_xor_hash)
2062 return 0;
2063
2064 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2065
2066 /* If this is from the same seqnum source, compare
2067 * seqnums */
2068 if (af->current_seqnum < bf->current_seqnum)
2069 return -1;
2070 if (af->current_seqnum > bf->current_seqnum)
2071 return 1;
2072
2073 /* Wow! This is weird, different data but the same
2074 * seqnums? Something is borked, but let's make the
2075 * best of it and compare by time. */
2076 }
2077
2078 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2079
2080 /* If the boot id matches, compare monotonic time */
2081 if (af->current_monotonic < bf->current_monotonic)
2082 return -1;
2083 if (af->current_monotonic > bf->current_monotonic)
2084 return 1;
2085 }
2086
2087 /* Otherwise, compare UTC time */
2088 if (af->current_realtime < bf->current_realtime)
2089 return -1;
2090 if (af->current_realtime > bf->current_realtime)
2091 return 1;
2092
2093 /* Finally, compare by contents */
2094 if (af->current_xor_hash < bf->current_xor_hash)
2095 return -1;
2096 if (af->current_xor_hash > bf->current_xor_hash)
2097 return 1;
2098
2099 return 0;
2100}
2101
de190aef
LP
2102int journal_file_next_entry(
2103 JournalFile *f,
f534928a 2104 uint64_t p,
de190aef
LP
2105 direction_t direction,
2106 Object **ret, uint64_t *offset) {
2107
fb099c8d 2108 uint64_t i, n, ofs;
cec736d2
LP
2109 int r;
2110
2111 assert(f);
de190aef
LP
2112
2113 n = le64toh(f->header->n_entries);
2114 if (n <= 0)
2115 return 0;
cec736d2 2116
f534928a 2117 if (p == 0)
de190aef 2118 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2119 else {
de190aef
LP
2120 r = generic_array_bisect(f,
2121 le64toh(f->header->entry_array_offset),
2122 le64toh(f->header->n_entries),
2123 p,
2124 test_object_offset,
2125 DIRECTION_DOWN,
2126 NULL, NULL,
2127 &i);
2128 if (r <= 0)
2129 return r;
2130
2131 if (direction == DIRECTION_DOWN) {
2132 if (i >= n - 1)
2133 return 0;
2134
2135 i++;
2136 } else {
2137 if (i <= 0)
2138 return 0;
2139
2140 i--;
2141 }
cec736d2
LP
2142 }
2143
de190aef 2144 /* And jump to it */
fb099c8d
ZJS
2145 r = generic_array_get(f,
2146 le64toh(f->header->entry_array_offset),
2147 i,
2148 ret, &ofs);
2149 if (r <= 0)
2150 return r;
2151
2152 if (p > 0 &&
2153 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2154 log_debug("%s: entry array corrupted at entry %"PRIu64,
2155 f->path, i);
2156 return -EBADMSG;
2157 }
2158
2159 if (offset)
2160 *offset = ofs;
2161
2162 return 1;
de190aef 2163}
cec736d2 2164
de190aef
LP
2165int journal_file_next_entry_for_data(
2166 JournalFile *f,
2167 Object *o, uint64_t p,
2168 uint64_t data_offset,
2169 direction_t direction,
2170 Object **ret, uint64_t *offset) {
2171
2172 uint64_t n, i;
cec736d2 2173 int r;
de190aef 2174 Object *d;
cec736d2
LP
2175
2176 assert(f);
de190aef 2177 assert(p > 0 || !o);
cec736d2 2178
de190aef 2179 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2180 if (r < 0)
de190aef 2181 return r;
cec736d2 2182
de190aef
LP
2183 n = le64toh(d->data.n_entries);
2184 if (n <= 0)
2185 return n;
cec736d2 2186
de190aef
LP
2187 if (!o)
2188 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2189 else {
2190 if (o->object.type != OBJECT_ENTRY)
2191 return -EINVAL;
cec736d2 2192
de190aef
LP
2193 r = generic_array_bisect_plus_one(f,
2194 le64toh(d->data.entry_offset),
2195 le64toh(d->data.entry_array_offset),
2196 le64toh(d->data.n_entries),
2197 p,
2198 test_object_offset,
2199 DIRECTION_DOWN,
2200 NULL, NULL,
2201 &i);
2202
2203 if (r <= 0)
cec736d2
LP
2204 return r;
2205
de190aef
LP
2206 if (direction == DIRECTION_DOWN) {
2207 if (i >= n - 1)
2208 return 0;
cec736d2 2209
de190aef
LP
2210 i++;
2211 } else {
2212 if (i <= 0)
2213 return 0;
cec736d2 2214
de190aef
LP
2215 i--;
2216 }
cec736d2 2217
de190aef 2218 }
cec736d2 2219
de190aef
LP
2220 return generic_array_get_plus_one(f,
2221 le64toh(d->data.entry_offset),
2222 le64toh(d->data.entry_array_offset),
2223 i,
2224 ret, offset);
2225}
cec736d2 2226
cbdca852
LP
2227int journal_file_move_to_entry_by_offset_for_data(
2228 JournalFile *f,
2229 uint64_t data_offset,
2230 uint64_t p,
2231 direction_t direction,
2232 Object **ret, uint64_t *offset) {
2233
2234 int r;
2235 Object *d;
2236
2237 assert(f);
2238
2239 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2240 if (r < 0)
2241 return r;
2242
2243 return generic_array_bisect_plus_one(f,
2244 le64toh(d->data.entry_offset),
2245 le64toh(d->data.entry_array_offset),
2246 le64toh(d->data.n_entries),
2247 p,
2248 test_object_offset,
2249 direction,
2250 ret, offset, NULL);
2251}
2252
2253int journal_file_move_to_entry_by_monotonic_for_data(
2254 JournalFile *f,
2255 uint64_t data_offset,
2256 sd_id128_t boot_id,
2257 uint64_t monotonic,
2258 direction_t direction,
2259 Object **ret, uint64_t *offset) {
2260
cbdca852
LP
2261 Object *o, *d;
2262 int r;
2263 uint64_t b, z;
2264
2265 assert(f);
2266
2267 /* First, seek by time */
47838ab3 2268 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2269 if (r < 0)
2270 return r;
2271 if (r == 0)
2272 return -ENOENT;
2273
2274 r = generic_array_bisect_plus_one(f,
2275 le64toh(o->data.entry_offset),
2276 le64toh(o->data.entry_array_offset),
2277 le64toh(o->data.n_entries),
2278 monotonic,
2279 test_object_monotonic,
2280 direction,
2281 NULL, &z, NULL);
2282 if (r <= 0)
2283 return r;
2284
2285 /* And now, continue seeking until we find an entry that
2286 * exists in both bisection arrays */
2287
2288 for (;;) {
2289 Object *qo;
2290 uint64_t p, q;
2291
2292 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2293 if (r < 0)
2294 return r;
2295
2296 r = generic_array_bisect_plus_one(f,
2297 le64toh(d->data.entry_offset),
2298 le64toh(d->data.entry_array_offset),
2299 le64toh(d->data.n_entries),
2300 z,
2301 test_object_offset,
2302 direction,
2303 NULL, &p, NULL);
2304 if (r <= 0)
2305 return r;
2306
2307 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2308 if (r < 0)
2309 return r;
2310
2311 r = generic_array_bisect_plus_one(f,
2312 le64toh(o->data.entry_offset),
2313 le64toh(o->data.entry_array_offset),
2314 le64toh(o->data.n_entries),
2315 p,
2316 test_object_offset,
2317 direction,
2318 &qo, &q, NULL);
2319
2320 if (r <= 0)
2321 return r;
2322
2323 if (p == q) {
2324 if (ret)
2325 *ret = qo;
2326 if (offset)
2327 *offset = q;
2328
2329 return 1;
2330 }
2331
2332 z = q;
2333 }
cbdca852
LP
2334}
2335
de190aef
LP
2336int journal_file_move_to_entry_by_seqnum_for_data(
2337 JournalFile *f,
2338 uint64_t data_offset,
2339 uint64_t seqnum,
2340 direction_t direction,
2341 Object **ret, uint64_t *offset) {
cec736d2 2342
de190aef
LP
2343 Object *d;
2344 int r;
cec736d2 2345
91a31dde
LP
2346 assert(f);
2347
de190aef 2348 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2349 if (r < 0)
de190aef 2350 return r;
cec736d2 2351
de190aef
LP
2352 return generic_array_bisect_plus_one(f,
2353 le64toh(d->data.entry_offset),
2354 le64toh(d->data.entry_array_offset),
2355 le64toh(d->data.n_entries),
2356 seqnum,
2357 test_object_seqnum,
2358 direction,
2359 ret, offset, NULL);
2360}
cec736d2 2361
de190aef
LP
2362int journal_file_move_to_entry_by_realtime_for_data(
2363 JournalFile *f,
2364 uint64_t data_offset,
2365 uint64_t realtime,
2366 direction_t direction,
2367 Object **ret, uint64_t *offset) {
2368
2369 Object *d;
2370 int r;
2371
91a31dde
LP
2372 assert(f);
2373
de190aef 2374 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2375 if (r < 0)
de190aef
LP
2376 return r;
2377
2378 return generic_array_bisect_plus_one(f,
2379 le64toh(d->data.entry_offset),
2380 le64toh(d->data.entry_array_offset),
2381 le64toh(d->data.n_entries),
2382 realtime,
2383 test_object_realtime,
2384 direction,
2385 ret, offset, NULL);
cec736d2
LP
2386}
2387
0284adc6 2388void journal_file_dump(JournalFile *f) {
7560fffc 2389 Object *o;
7560fffc 2390 int r;
0284adc6 2391 uint64_t p;
7560fffc
LP
2392
2393 assert(f);
2394
0284adc6 2395 journal_file_print_header(f);
7560fffc 2396
0284adc6
LP
2397 p = le64toh(f->header->header_size);
2398 while (p != 0) {
d05089d8 2399 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
2400 if (r < 0)
2401 goto fail;
7560fffc 2402
0284adc6 2403 switch (o->object.type) {
d98cc1f2 2404
0284adc6
LP
2405 case OBJECT_UNUSED:
2406 printf("Type: OBJECT_UNUSED\n");
2407 break;
d98cc1f2 2408
0284adc6
LP
2409 case OBJECT_DATA:
2410 printf("Type: OBJECT_DATA\n");
2411 break;
7560fffc 2412
3c1668da
LP
2413 case OBJECT_FIELD:
2414 printf("Type: OBJECT_FIELD\n");
2415 break;
2416
0284adc6 2417 case OBJECT_ENTRY:
507f22bd
ZJS
2418 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2419 le64toh(o->entry.seqnum),
2420 le64toh(o->entry.monotonic),
2421 le64toh(o->entry.realtime));
0284adc6 2422 break;
7560fffc 2423
0284adc6
LP
2424 case OBJECT_FIELD_HASH_TABLE:
2425 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2426 break;
7560fffc 2427
0284adc6
LP
2428 case OBJECT_DATA_HASH_TABLE:
2429 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2430 break;
7560fffc 2431
0284adc6
LP
2432 case OBJECT_ENTRY_ARRAY:
2433 printf("Type: OBJECT_ENTRY_ARRAY\n");
2434 break;
7560fffc 2435
0284adc6 2436 case OBJECT_TAG:
507f22bd
ZJS
2437 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2438 le64toh(o->tag.seqnum),
2439 le64toh(o->tag.epoch));
0284adc6 2440 break;
3c1668da
LP
2441
2442 default:
8facc349 2443 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 2444 break;
0284adc6 2445 }
7560fffc 2446
d89c8fdf
ZJS
2447 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2448 printf("Flags: %s\n",
2449 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 2450
0284adc6
LP
2451 if (p == le64toh(f->header->tail_object_offset))
2452 p = 0;
2453 else
2454 p = p + ALIGN64(le64toh(o->object.size));
2455 }
7560fffc 2456
0284adc6
LP
2457 return;
2458fail:
2459 log_error("File corrupt");
7560fffc
LP
2460}
2461
718fe4b1
ZJS
2462static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2463 const char *x;
2464
2465 x = format_timestamp(buf, l, t);
2466 if (x)
2467 return x;
2468 return " --- ";
2469}
2470
0284adc6 2471void journal_file_print_header(JournalFile *f) {
2765b7bb 2472 char a[33], b[33], c[33], d[33];
ed375beb 2473 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2474 struct stat st;
2475 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2476
2477 assert(f);
7560fffc 2478
0284adc6
LP
2479 printf("File Path: %s\n"
2480 "File ID: %s\n"
2481 "Machine ID: %s\n"
2482 "Boot ID: %s\n"
2483 "Sequential Number ID: %s\n"
2484 "State: %s\n"
2485 "Compatible Flags:%s%s\n"
d89c8fdf 2486 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
2487 "Header size: %"PRIu64"\n"
2488 "Arena size: %"PRIu64"\n"
2489 "Data Hash Table Size: %"PRIu64"\n"
2490 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2491 "Rotate Suggested: %s\n"
507f22bd
ZJS
2492 "Head Sequential Number: %"PRIu64"\n"
2493 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2494 "Head Realtime Timestamp: %s\n"
3223f44f 2495 "Tail Realtime Timestamp: %s\n"
ed375beb 2496 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2497 "Objects: %"PRIu64"\n"
2498 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2499 f->path,
2500 sd_id128_to_string(f->header->file_id, a),
2501 sd_id128_to_string(f->header->machine_id, b),
2502 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2503 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2504 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2505 f->header->state == STATE_ONLINE ? "ONLINE" :
2506 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 2507 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
2508 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2509 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2510 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2511 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
2512 le64toh(f->header->header_size),
2513 le64toh(f->header->arena_size),
2514 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2515 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2516 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2517 le64toh(f->header->head_entry_seqnum),
2518 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2519 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2520 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2521 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2522 le64toh(f->header->n_objects),
2523 le64toh(f->header->n_entries));
7560fffc 2524
0284adc6 2525 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2526 printf("Data Objects: %"PRIu64"\n"
0284adc6 2527 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2528 le64toh(f->header->n_data),
0284adc6 2529 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2530
0284adc6 2531 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2532 printf("Field Objects: %"PRIu64"\n"
0284adc6 2533 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2534 le64toh(f->header->n_fields),
0284adc6 2535 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2536
2537 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2538 printf("Tag Objects: %"PRIu64"\n",
2539 le64toh(f->header->n_tags));
3223f44f 2540 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2541 printf("Entry Array Objects: %"PRIu64"\n",
2542 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2543
2544 if (fstat(f->fd, &st) >= 0)
2545 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
7560fffc
LP
2546}
2547
fc68c929
LP
2548static int journal_file_warn_btrfs(JournalFile *f) {
2549 unsigned attrs;
2550 int r;
2551
2552 assert(f);
2553
2554 /* Before we write anything, check if the COW logic is turned
2555 * off on btrfs. Given our write pattern that is quite
2556 * unfriendly to COW file systems this should greatly improve
2557 * performance on COW file systems, such as btrfs, at the
2558 * expense of data integrity features (which shouldn't be too
2559 * bad, given that we do our own checksumming). */
2560
2561 r = btrfs_is_filesystem(f->fd);
2562 if (r < 0)
2563 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2564 if (!r)
2565 return 0;
2566
2567 r = read_attr_fd(f->fd, &attrs);
2568 if (r < 0)
2569 return log_warning_errno(r, "Failed to read file attributes: %m");
2570
2571 if (attrs & FS_NOCOW_FL) {
2572 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2573 return 0;
2574 }
2575
2576 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2577 "This is likely to slow down journal access substantially, please consider turning "
2578 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2579
2580 return 1;
2581}
2582
0284adc6
LP
2583int journal_file_open(
2584 const char *fname,
2585 int flags,
2586 mode_t mode,
2587 bool compress,
baed47c3 2588 bool seal,
0284adc6
LP
2589 JournalMetrics *metrics,
2590 MMapCache *mmap_cache,
2591 JournalFile *template,
2592 JournalFile **ret) {
7560fffc 2593
fa6ac760 2594 bool newly_created = false;
0284adc6 2595 JournalFile *f;
fa6ac760 2596 void *h;
0284adc6 2597 int r;
7560fffc 2598
0284adc6 2599 assert(fname);
0559d3a5 2600 assert(ret);
7560fffc 2601
0284adc6
LP
2602 if ((flags & O_ACCMODE) != O_RDONLY &&
2603 (flags & O_ACCMODE) != O_RDWR)
2604 return -EINVAL;
7560fffc 2605
a0108012
LP
2606 if (!endswith(fname, ".journal") &&
2607 !endswith(fname, ".journal~"))
0284adc6 2608 return -EINVAL;
7560fffc 2609
0284adc6
LP
2610 f = new0(JournalFile, 1);
2611 if (!f)
2612 return -ENOMEM;
7560fffc 2613
0284adc6
LP
2614 f->fd = -1;
2615 f->mode = mode;
7560fffc 2616
0284adc6
LP
2617 f->flags = flags;
2618 f->prot = prot_from_flags(flags);
2619 f->writable = (flags & O_ACCMODE) != O_RDONLY;
92261977 2620#if defined(HAVE_LZ4)
d89c8fdf
ZJS
2621 f->compress_lz4 = compress;
2622#elif defined(HAVE_XZ)
2623 f->compress_xz = compress;
48b61739 2624#endif
49a32d43 2625#ifdef HAVE_GCRYPT
baed47c3 2626 f->seal = seal;
49a32d43 2627#endif
7560fffc 2628
0284adc6
LP
2629 if (mmap_cache)
2630 f->mmap = mmap_cache_ref(mmap_cache);
2631 else {
84168d80 2632 f->mmap = mmap_cache_new();
0284adc6
LP
2633 if (!f->mmap) {
2634 r = -ENOMEM;
2635 goto fail;
2636 }
2637 }
7560fffc 2638
0284adc6
LP
2639 f->path = strdup(fname);
2640 if (!f->path) {
2641 r = -ENOMEM;
2642 goto fail;
2643 }
7560fffc 2644
4743015d 2645 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
2646 if (!f->chain_cache) {
2647 r = -ENOMEM;
2648 goto fail;
2649 }
2650
0284adc6
LP
2651 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2652 if (f->fd < 0) {
2653 r = -errno;
2654 goto fail;
7560fffc 2655 }
7560fffc 2656
2678031a
LP
2657 r = journal_file_fstat(f);
2658 if (r < 0)
0284adc6 2659 goto fail;
7560fffc 2660
0284adc6 2661 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a 2662
fc68c929 2663 (void) journal_file_warn_btrfs(f);
11689d2a 2664
fb0951b0
LP
2665 /* Let's attach the creation time to the journal file,
2666 * so that the vacuuming code knows the age of this
2667 * file even if the file might end up corrupted one
2668 * day... Ideally we'd just use the creation time many
2669 * file systems maintain for each file, but there is
2670 * currently no usable API to query this, hence let's
2671 * emulate this via extended attributes. If extended
2672 * attributes are not supported we'll just skip this,
7517e174 2673 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0 2674
d61b600d 2675 fd_setcrtime(f->fd, 0);
7560fffc 2676
feb12d3e 2677#ifdef HAVE_GCRYPT
0284adc6 2678 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2679 * just don't do sealing */
49a32d43
LP
2680 if (f->seal) {
2681 r = journal_file_fss_load(f);
2682 if (r < 0)
2683 f->seal = false;
2684 }
feb12d3e 2685#endif
7560fffc 2686
0284adc6
LP
2687 r = journal_file_init_header(f, template);
2688 if (r < 0)
2689 goto fail;
7560fffc 2690
2678031a
LP
2691 r = journal_file_fstat(f);
2692 if (r < 0)
0284adc6 2693 goto fail;
fb0951b0
LP
2694
2695 newly_created = true;
0284adc6 2696 }
7560fffc 2697
0284adc6
LP
2698 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2699 r = -EIO;
2700 goto fail;
2701 }
7560fffc 2702
fa6ac760 2703 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
977eaa1e 2704 if (r < 0)
0284adc6 2705 goto fail;
7560fffc 2706
fa6ac760
LP
2707 f->header = h;
2708
0284adc6
LP
2709 if (!newly_created) {
2710 r = journal_file_verify_header(f);
2711 if (r < 0)
2712 goto fail;
2713 }
7560fffc 2714
feb12d3e 2715#ifdef HAVE_GCRYPT
0284adc6 2716 if (!newly_created && f->writable) {
baed47c3 2717 r = journal_file_fss_load(f);
0284adc6
LP
2718 if (r < 0)
2719 goto fail;
2720 }
feb12d3e 2721#endif
cec736d2
LP
2722
2723 if (f->writable) {
4a92baf3
LP
2724 if (metrics) {
2725 journal_default_metrics(metrics, f->fd);
2726 f->metrics = *metrics;
2727 } else if (template)
2728 f->metrics = template->metrics;
2729
cec736d2
LP
2730 r = journal_file_refresh_header(f);
2731 if (r < 0)
2732 goto fail;
2733 }
2734
feb12d3e 2735#ifdef HAVE_GCRYPT
baed47c3 2736 r = journal_file_hmac_setup(f);
14d10188
LP
2737 if (r < 0)
2738 goto fail;
feb12d3e 2739#endif
14d10188 2740
cec736d2 2741 if (newly_created) {
de190aef 2742 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2743 if (r < 0)
2744 goto fail;
2745
de190aef 2746 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2747 if (r < 0)
2748 goto fail;
7560fffc 2749
feb12d3e 2750#ifdef HAVE_GCRYPT
7560fffc
LP
2751 r = journal_file_append_first_tag(f);
2752 if (r < 0)
2753 goto fail;
feb12d3e 2754#endif
cec736d2
LP
2755 }
2756
fa6ac760
LP
2757 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2758 r = -EIO;
2759 goto fail;
2760 }
2761
0559d3a5 2762 *ret = f;
cec736d2
LP
2763 return 0;
2764
2765fail:
fa6ac760
LP
2766 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2767 r = -EIO;
2768
cec736d2
LP
2769 journal_file_close(f);
2770
2771 return r;
2772}
0ac38b70 2773
baed47c3 2774int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2775 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2776 size_t l;
2777 JournalFile *old_file, *new_file = NULL;
2778 int r;
2779
2780 assert(f);
2781 assert(*f);
2782
2783 old_file = *f;
2784
2785 if (!old_file->writable)
2786 return -EINVAL;
2787
2788 if (!endswith(old_file->path, ".journal"))
2789 return -EINVAL;
2790
2791 l = strlen(old_file->path);
57535f47
ZJS
2792 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2793 (int) l - 8, old_file->path,
2794 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2795 le64toh((*f)->header->head_entry_seqnum),
2796 le64toh((*f)->header->head_entry_realtime));
2797 if (r < 0)
0ac38b70
LP
2798 return -ENOMEM;
2799
2678031a
LP
2800 /* Try to rename the file to the archived version. If the file
2801 * already was deleted, we'll get ENOENT, let's ignore that
2802 * case. */
0ac38b70 2803 r = rename(old_file->path, p);
2678031a 2804 if (r < 0 && errno != ENOENT)
0ac38b70
LP
2805 return -errno;
2806
ccdbaf91 2807 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2808
f27a3864
LP
2809 /* Currently, btrfs is not very good with out write patterns
2810 * and fragments heavily. Let's defrag our journal files when
2811 * we archive them */
2812 old_file->defrag_on_close = true;
2813
baed47c3 2814 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2815 journal_file_close(old_file);
2816
2817 *f = new_file;
2818 return r;
2819}
2820
9447a7f1
LP
2821int journal_file_open_reliably(
2822 const char *fname,
2823 int flags,
2824 mode_t mode,
7560fffc 2825 bool compress,
baed47c3 2826 bool seal,
4a92baf3 2827 JournalMetrics *metrics,
27370278 2828 MMapCache *mmap_cache,
9447a7f1
LP
2829 JournalFile *template,
2830 JournalFile **ret) {
2831
2832 int r;
2833 size_t l;
ed375beb 2834 _cleanup_free_ char *p = NULL;
9447a7f1 2835
baed47c3 2836 r = journal_file_open(fname, flags, mode, compress, seal,
27370278 2837 metrics, mmap_cache, template, ret);
288359db
ZJS
2838 if (!IN_SET(r,
2839 -EBADMSG, /* corrupted */
2840 -ENODATA, /* truncated */
2841 -EHOSTDOWN, /* other machine */
2842 -EPROTONOSUPPORT, /* incompatible feature */
2843 -EBUSY, /* unclean shutdown */
2844 -ESHUTDOWN, /* already archived */
2845 -EIO, /* IO error, including SIGBUS on mmap */
2846 -EIDRM /* File has been deleted */))
9447a7f1
LP
2847 return r;
2848
2849 if ((flags & O_ACCMODE) == O_RDONLY)
2850 return r;
2851
2852 if (!(flags & O_CREAT))
2853 return r;
2854
7560fffc
LP
2855 if (!endswith(fname, ".journal"))
2856 return r;
2857
5c70eab4
LP
2858 /* The file is corrupted. Rotate it away and try it again (but only once) */
2859
9447a7f1 2860 l = strlen(fname);
d587eca5 2861 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
57535f47 2862 (int) l - 8, fname,
d587eca5 2863 now(CLOCK_REALTIME),
9bf3b535 2864 random_u64()) < 0)
9447a7f1
LP
2865 return -ENOMEM;
2866
2867 r = rename(fname, p);
9447a7f1
LP
2868 if (r < 0)
2869 return -errno;
2870
f27a3864
LP
2871 /* btrfs doesn't cope well with our write pattern and
2872 * fragments heavily. Let's defrag all files we rotate */
11689d2a
LP
2873
2874 (void) chattr_path(p, false, FS_NOCOW_FL);
f27a3864
LP
2875 (void) btrfs_defrag(p);
2876
a1a1898f 2877 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2878
baed47c3 2879 return journal_file_open(fname, flags, mode, compress, seal,
27370278 2880 metrics, mmap_cache, template, ret);
9447a7f1
LP
2881}
2882
cf244689
LP
2883int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2884 uint64_t i, n;
2885 uint64_t q, xor_hash = 0;
2886 int r;
2887 EntryItem *items;
2888 dual_timestamp ts;
2889
2890 assert(from);
2891 assert(to);
2892 assert(o);
2893 assert(p);
2894
2895 if (!to->writable)
2896 return -EPERM;
2897
2898 ts.monotonic = le64toh(o->entry.monotonic);
2899 ts.realtime = le64toh(o->entry.realtime);
2900
cf244689 2901 n = journal_file_entry_n_items(o);
4faa7004
TA
2902 /* alloca() can't take 0, hence let's allocate at least one */
2903 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
2904
2905 for (i = 0; i < n; i++) {
4fd052ae
FC
2906 uint64_t l, h;
2907 le64_t le_hash;
cf244689
LP
2908 size_t t;
2909 void *data;
2910 Object *u;
2911
2912 q = le64toh(o->entry.items[i].object_offset);
2913 le_hash = o->entry.items[i].hash;
2914
2915 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2916 if (r < 0)
2917 return r;
2918
2919 if (le_hash != o->data.hash)
2920 return -EBADMSG;
2921
2922 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2923 t = (size_t) l;
2924
2925 /* We hit the limit on 32bit machines */
2926 if ((uint64_t) t != l)
2927 return -E2BIG;
2928
d89c8fdf 2929 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 2930#if defined(HAVE_XZ) || defined(HAVE_LZ4)
a7f7d1bd 2931 size_t rsize = 0;
cf244689 2932
d89c8fdf
ZJS
2933 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2934 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2935 if (r < 0)
2936 return r;
cf244689
LP
2937
2938 data = from->compress_buffer;
2939 l = rsize;
3b1a55e1
ZJS
2940#else
2941 return -EPROTONOSUPPORT;
2942#endif
cf244689
LP
2943 } else
2944 data = o->data.payload;
2945
2946 r = journal_file_append_data(to, data, l, &u, &h);
2947 if (r < 0)
2948 return r;
2949
2950 xor_hash ^= le64toh(u->data.hash);
2951 items[i].object_offset = htole64(h);
2952 items[i].hash = u->data.hash;
2953
2954 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2955 if (r < 0)
2956 return r;
2957 }
2958
fa6ac760
LP
2959 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2960
2961 if (mmap_cache_got_sigbus(to->mmap, to->fd))
2962 return -EIO;
2963
2964 return r;
cf244689 2965}
babfc091
LP
2966
2967void journal_default_metrics(JournalMetrics *m, int fd) {
2968 uint64_t fs_size = 0;
2969 struct statvfs ss;
a7bc2c2a 2970 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
babfc091
LP
2971
2972 assert(m);
2973 assert(fd >= 0);
2974
2975 if (fstatvfs(fd, &ss) >= 0)
2976 fs_size = ss.f_frsize * ss.f_blocks;
2977
2978 if (m->max_use == (uint64_t) -1) {
2979
2980 if (fs_size > 0) {
2981 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2982
2983 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2984 m->max_use = DEFAULT_MAX_USE_UPPER;
2985
2986 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2987 m->max_use = DEFAULT_MAX_USE_LOWER;
2988 } else
2989 m->max_use = DEFAULT_MAX_USE_LOWER;
2990 } else {
2991 m->max_use = PAGE_ALIGN(m->max_use);
2992
2993 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2994 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2995 }
2996
2997 if (m->max_size == (uint64_t) -1) {
2998 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2999
3000 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3001 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3002 } else
3003 m->max_size = PAGE_ALIGN(m->max_size);
3004
3005 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3006 m->max_size = JOURNAL_FILE_SIZE_MIN;
3007
3008 if (m->max_size*2 > m->max_use)
3009 m->max_use = m->max_size*2;
3010
3011 if (m->min_size == (uint64_t) -1)
3012 m->min_size = JOURNAL_FILE_SIZE_MIN;
3013 else {
3014 m->min_size = PAGE_ALIGN(m->min_size);
3015
3016 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3017 m->min_size = JOURNAL_FILE_SIZE_MIN;
3018
3019 if (m->min_size > m->max_size)
3020 m->max_size = m->min_size;
3021 }
3022
3023 if (m->keep_free == (uint64_t) -1) {
3024
3025 if (fs_size > 0) {
8621b110 3026 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
3027
3028 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3029 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3030
3031 } else
3032 m->keep_free = DEFAULT_KEEP_FREE;
3033 }
3034
2b43f939
LP
3035 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
3036 format_bytes(a, sizeof(a), m->max_use),
3037 format_bytes(b, sizeof(b), m->max_size),
3038 format_bytes(c, sizeof(c), m->min_size),
3039 format_bytes(d, sizeof(d), m->keep_free));
babfc091 3040}
08984293
LP
3041
3042int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
3043 assert(f);
3044 assert(from || to);
3045
3046 if (from) {
162566a4
LP
3047 if (f->header->head_entry_realtime == 0)
3048 return -ENOENT;
08984293 3049
162566a4 3050 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
3051 }
3052
3053 if (to) {
162566a4
LP
3054 if (f->header->tail_entry_realtime == 0)
3055 return -ENOENT;
08984293 3056
162566a4 3057 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
3058 }
3059
3060 return 1;
3061}
3062
3063int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
3064 Object *o;
3065 uint64_t p;
3066 int r;
3067
3068 assert(f);
3069 assert(from || to);
3070
47838ab3 3071 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
3072 if (r <= 0)
3073 return r;
3074
3075 if (le64toh(o->data.n_entries) <= 0)
3076 return 0;
3077
3078 if (from) {
3079 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3080 if (r < 0)
3081 return r;
3082
3083 *from = le64toh(o->entry.monotonic);
3084 }
3085
3086 if (to) {
3087 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3088 if (r < 0)
3089 return r;
3090
3091 r = generic_array_get_plus_one(f,
3092 le64toh(o->data.entry_offset),
3093 le64toh(o->data.entry_array_offset),
3094 le64toh(o->data.n_entries)-1,
3095 &o, NULL);
3096 if (r <= 0)
3097 return r;
3098
3099 *to = le64toh(o->entry.monotonic);
3100 }
3101
3102 return 1;
3103}
dca6219e 3104
fb0951b0 3105bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
3106 assert(f);
3107
3108 /* If we gained new header fields we gained new features,
3109 * hence suggest a rotation */
361f9cbc
LP
3110 if (le64toh(f->header->header_size) < sizeof(Header)) {
3111 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3112 return true;
361f9cbc 3113 }
dca6219e
LP
3114
3115 /* Let's check if the hash tables grew over a certain fill
3116 * level (75%, borrowing this value from Java's hash table
3117 * implementation), and if so suggest a rotation. To calculate
3118 * the fill level we need the n_data field, which only exists
3119 * in newer versions. */
3120
3121 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3122 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3123 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3124 f->path,
3125 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3126 le64toh(f->header->n_data),
3127 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3128 (unsigned long long) f->last_stat.st_size,
3129 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3130 return true;
361f9cbc 3131 }
dca6219e
LP
3132
3133 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3134 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3135 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3136 f->path,
3137 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3138 le64toh(f->header->n_fields),
3139 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3140 return true;
361f9cbc 3141 }
dca6219e 3142
0598fd4a
LP
3143 /* Are the data objects properly indexed by field objects? */
3144 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3145 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3146 le64toh(f->header->n_data) > 0 &&
3147 le64toh(f->header->n_fields) == 0)
3148 return true;
3149
fb0951b0
LP
3150 if (max_file_usec > 0) {
3151 usec_t t, h;
3152
3153 h = le64toh(f->header->head_entry_realtime);
3154 t = now(CLOCK_REALTIME);
3155
3156 if (h > 0 && t > h + max_file_usec)
3157 return true;
3158 }
3159
dca6219e
LP
3160 return false;
3161}