]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
sd-*.h: clean up exported (or to-be-exported) header files
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
cec736d2 22#include <errno.h>
cec736d2 23#include <fcntl.h>
11689d2a 24#include <linux/fs.h>
07630cea
LP
25#include <stddef.h>
26#include <sys/mman.h>
27#include <sys/statvfs.h>
28#include <sys/uio.h>
29#include <unistd.h>
fb0951b0 30
f27a3864 31#include "btrfs-util.h"
07630cea 32#include "compress.h"
0284adc6 33#include "journal-authenticate.h"
07630cea 34#include "journal-def.h"
cec736d2 35#include "lookup3.h"
3df3e884 36#include "random-util.h"
07630cea
LP
37#include "string-util.h"
38#include "journal-file.h"
cec736d2 39
4a92baf3
LP
/* Default sizes (in bytes) of the data and field hash tables */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Payloads smaller than this are never handed to the compressor */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (4ULL << 20)                 /* 4 MiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL << 20)                 /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL << 30)                 /* 4 GiB */

/* This is the default minimal use limit, how much we'll use even if
 * keep_free suggests otherwise. */
#define DEFAULT_MIN_USE (1ULL << 20)                       /* 1 MiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL << 20)              /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL << 30)               /* 4 GiB */

/* This is the keep_free value when we can't determine the file system
 * size */
#define DEFAULT_KEEP_FREE (1ULL << 20)                     /* 1 MiB */

/* This is the default maximum number of journal files to keep around. */
#define DEFAULT_N_MAX_FILES (100)

/* n_data was the first entry we added after the initial file format design */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8ULL << 20)                    /* 8 MiB */

/* Reread fstat() of the file for detecting deletions at least this often */
#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)

/* The mmap context to use for the header: we pick the one right above the last defined object type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX
84
9588bc32 85static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
86 assert(f);
87
88 if (!f->writable)
89 return -EPERM;
90
91 if (!(f->fd >= 0 && f->header))
92 return -EINVAL;
93
fa6ac760
LP
94 if (mmap_cache_got_sigbus(f->mmap, f->fd))
95 return -EIO;
96
26687bf8
OS
97 switch(f->header->state) {
98 case STATE_ONLINE:
99 return 0;
100
101 case STATE_OFFLINE:
102 f->header->state = STATE_ONLINE;
103 fsync(f->fd);
104 return 0;
105
106 default:
107 return -EINVAL;
108 }
109}
110
111int journal_file_set_offline(JournalFile *f) {
112 assert(f);
113
114 if (!f->writable)
115 return -EPERM;
116
117 if (!(f->fd >= 0 && f->header))
118 return -EINVAL;
119
120 if (f->header->state != STATE_ONLINE)
121 return 0;
122
123 fsync(f->fd);
124
fa6ac760
LP
125 if (mmap_cache_got_sigbus(f->mmap, f->fd))
126 return -EIO;
127
26687bf8
OS
128 f->header->state = STATE_OFFLINE;
129
fa6ac760
LP
130 if (mmap_cache_got_sigbus(f->mmap, f->fd))
131 return -EIO;
132
26687bf8
OS
133 fsync(f->fd);
134
135 return 0;
136}
137
804ae586 138JournalFile* journal_file_close(JournalFile *f) {
de190aef 139 assert(f);
cec736d2 140
feb12d3e 141#ifdef HAVE_GCRYPT
b0af6f41 142 /* Write the final tag */
c586dbf1 143 if (f->seal && f->writable)
b0af6f41 144 journal_file_append_tag(f);
feb12d3e 145#endif
b0af6f41 146
26687bf8 147 journal_file_set_offline(f);
cec736d2 148
fa6ac760
LP
149 if (f->mmap && f->fd >= 0)
150 mmap_cache_close_fd(f->mmap, f->fd);
cec736d2 151
11689d2a
LP
152 if (f->fd >= 0 && f->defrag_on_close) {
153
154 /* Be friendly to btrfs: turn COW back on again now,
155 * and defragment the file. We won't write to the file
156 * ever again, hence remove all fragmentation, and
157 * reenable all the good bits COW usually provides
158 * (such as data checksumming). */
159
1ed8f8c1 160 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
11689d2a
LP
161 (void) btrfs_defrag_fd(f->fd);
162 }
f27a3864 163
03e334a1 164 safe_close(f->fd);
cec736d2 165 free(f->path);
807e17f0 166
16e9f408
LP
167 if (f->mmap)
168 mmap_cache_unref(f->mmap);
169
4743015d 170 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 171
d89c8fdf 172#if defined(HAVE_XZ) || defined(HAVE_LZ4)
807e17f0
LP
173 free(f->compress_buffer);
174#endif
175
7560fffc 176#ifdef HAVE_GCRYPT
baed47c3
LP
177 if (f->fss_file)
178 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
dc4ebc07 179 else
b7c9ae91
LP
180 free(f->fsprg_state);
181
182 free(f->fsprg_seed);
7560fffc
LP
183
184 if (f->hmac)
185 gcry_md_close(f->hmac);
186#endif
187
cec736d2 188 free(f);
804ae586 189 return NULL;
cec736d2
LP
190}
191
0ac38b70 192static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 193 Header h = {};
cec736d2
LP
194 ssize_t k;
195 int r;
196
197 assert(f);
198
7560fffc 199 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 200 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 201
d89c8fdf
ZJS
202 h.incompatible_flags |= htole32(
203 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
204 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 205
d89c8fdf
ZJS
206 h.compatible_flags = htole32(
207 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 208
cec736d2
LP
209 r = sd_id128_randomize(&h.file_id);
210 if (r < 0)
211 return r;
212
0ac38b70
LP
213 if (template) {
214 h.seqnum_id = template->header->seqnum_id;
beec0085 215 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
216 } else
217 h.seqnum_id = h.file_id;
cec736d2
LP
218
219 k = pwrite(f->fd, &h, sizeof(h), 0);
220 if (k < 0)
221 return -errno;
222
223 if (k != sizeof(h))
224 return -EIO;
225
226 return 0;
227}
228
229static int journal_file_refresh_header(JournalFile *f) {
de190aef 230 sd_id128_t boot_id;
fa6ac760 231 int r;
cec736d2
LP
232
233 assert(f);
234
235 r = sd_id128_get_machine(&f->header->machine_id);
236 if (r < 0)
237 return r;
238
de190aef 239 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
240 if (r < 0)
241 return r;
242
de190aef
LP
243 if (sd_id128_equal(boot_id, f->header->boot_id))
244 f->tail_entry_monotonic_valid = true;
245
246 f->header->boot_id = boot_id;
247
fa6ac760 248 r = journal_file_set_online(f);
b788cc23 249
7560fffc 250 /* Sync the online state to disk */
a676e665 251 fsync(f->fd);
b788cc23 252
fa6ac760 253 return r;
cec736d2
LP
254}
255
256static int journal_file_verify_header(JournalFile *f) {
d89c8fdf
ZJS
257 uint32_t flags;
258
cec736d2
LP
259 assert(f);
260
7560fffc 261 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
262 return -EBADMSG;
263
7560fffc
LP
264 /* In both read and write mode we refuse to open files with
265 * incompatible flags we don't know */
d89c8fdf
ZJS
266 flags = le32toh(f->header->incompatible_flags);
267 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
268 if (flags & ~HEADER_INCOMPATIBLE_ANY)
269 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
270 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
271 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
272 if (flags)
273 log_debug("Journal file %s uses incompatible flags %"PRIx32
274 " disabled at compilation time.", f->path, flags);
cec736d2 275 return -EPROTONOSUPPORT;
d89c8fdf 276 }
cec736d2 277
7560fffc
LP
278 /* When open for writing we refuse to open files with
279 * compatible flags, too */
d89c8fdf
ZJS
280 flags = le32toh(f->header->compatible_flags);
281 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
282 if (flags & ~HEADER_COMPATIBLE_ANY)
283 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
284 f->path, flags & ~HEADER_COMPATIBLE_ANY);
285 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
286 if (flags)
287 log_debug("Journal file %s uses compatible flags %"PRIx32
288 " disabled at compilation time.", f->path, flags);
289 return -EPROTONOSUPPORT;
7560fffc
LP
290 }
291
db11ac1a
LP
292 if (f->header->state >= _STATE_MAX)
293 return -EBADMSG;
294
dca6219e
LP
295 /* The first addition was n_data, so check that we are at least this large */
296 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
297 return -EBADMSG;
298
8088cbd3 299 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
300 return -EBADMSG;
301
db11ac1a
LP
302 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
303 return -ENODATA;
304
305 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
306 return -ENODATA;
307
7762e02b
LP
308 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
309 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
310 !VALID64(le64toh(f->header->tail_object_offset)) ||
311 !VALID64(le64toh(f->header->entry_array_offset)))
312 return -ENODATA;
313
cec736d2 314 if (f->writable) {
ccdbaf91 315 uint8_t state;
cec736d2
LP
316 sd_id128_t machine_id;
317 int r;
318
319 r = sd_id128_get_machine(&machine_id);
320 if (r < 0)
321 return r;
322
323 if (!sd_id128_equal(machine_id, f->header->machine_id))
324 return -EHOSTDOWN;
325
de190aef 326 state = f->header->state;
cec736d2 327
71fa6f00
LP
328 if (state == STATE_ONLINE) {
329 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
330 return -EBUSY;
331 } else if (state == STATE_ARCHIVED)
cec736d2 332 return -ESHUTDOWN;
71fa6f00 333 else if (state != STATE_OFFLINE) {
8facc349 334 log_debug("Journal file %s has unknown state %i.", f->path, state);
71fa6f00
LP
335 return -EBUSY;
336 }
cec736d2
LP
337 }
338
d89c8fdf
ZJS
339 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
340 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 341
f1889c91 342 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 343
cec736d2
LP
344 return 0;
345}
346
2678031a
LP
347static int journal_file_fstat(JournalFile *f) {
348 assert(f);
349 assert(f->fd >= 0);
350
351 if (fstat(f->fd, &f->last_stat) < 0)
352 return -errno;
353
354 f->last_stat_usec = now(CLOCK_MONOTONIC);
355
356 /* Refuse appending to files that are already deleted */
357 if (f->last_stat.st_nlink <= 0)
358 return -EIDRM;
359
360 return 0;
361}
362
cec736d2 363static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 364 uint64_t old_size, new_size;
fec2aa2f 365 int r;
cec736d2
LP
366
367 assert(f);
368
cec736d2 369 /* We assume that this file is not sparse, and we know that
38ac38b2 370 * for sure, since we always call posix_fallocate()
cec736d2
LP
371 * ourselves */
372
fa6ac760
LP
373 if (mmap_cache_got_sigbus(f->mmap, f->fd))
374 return -EIO;
375
cec736d2 376 old_size =
23b0b2b2 377 le64toh(f->header->header_size) +
cec736d2
LP
378 le64toh(f->header->arena_size);
379
bc85bfee 380 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
381 if (new_size < le64toh(f->header->header_size))
382 new_size = le64toh(f->header->header_size);
bc85bfee 383
2678031a
LP
384 if (new_size <= old_size) {
385
386 /* We already pre-allocated enough space, but before
387 * we write to it, let's check with fstat() if the
388 * file got deleted, in order make sure we don't throw
389 * away the data immediately. Don't check fstat() for
390 * all writes though, but only once ever 10s. */
391
392 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
393 return 0;
394
395 return journal_file_fstat(f);
396 }
397
398 /* Allocate more space. */
cec736d2 399
a676e665 400 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 401 return -E2BIG;
cec736d2 402
a676e665 403 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
404 struct statvfs svfs;
405
406 if (fstatvfs(f->fd, &svfs) >= 0) {
407 uint64_t available;
408
070052ab 409 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
cec736d2
LP
410
411 if (new_size - old_size > available)
412 return -E2BIG;
413 }
414 }
415
eda4b58b
LP
416 /* Increase by larger blocks at once */
417 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
418 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
419 new_size = f->metrics.max_size;
420
bc85bfee
LP
421 /* Note that the glibc fallocate() fallback is very
422 inefficient, hence we try to minimize the allocation area
423 as we can. */
fec2aa2f
GV
424 r = posix_fallocate(f->fd, old_size, new_size - old_size);
425 if (r != 0)
426 return -r;
cec736d2 427
23b0b2b2 428 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 429
2678031a 430 return journal_file_fstat(f);
cec736d2
LP
431}
432
78519831 433static unsigned type_to_context(ObjectType type) {
d3d3208f 434 /* One context for each type, plus one catch-all for the rest */
69adae51 435 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 436 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 437 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
438}
439
7a9dabea 440static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
2678031a
LP
441 int r;
442
cec736d2 443 assert(f);
cec736d2
LP
444 assert(ret);
445
7762e02b
LP
446 if (size <= 0)
447 return -EINVAL;
448
2a59ea54 449 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
450 if (offset + size > (uint64_t) f->last_stat.st_size) {
451 /* Hmm, out of range? Let's refresh the fstat() data
452 * first, before we trust that check. */
453
2678031a
LP
454 r = journal_file_fstat(f);
455 if (r < 0)
456 return r;
457
458 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
459 return -EADDRNOTAVAIL;
460 }
461
7a9dabea 462 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
463}
464
16e9f408
LP
465static uint64_t minimum_header_size(Object *o) {
466
b8e891e6 467 static const uint64_t table[] = {
16e9f408
LP
468 [OBJECT_DATA] = sizeof(DataObject),
469 [OBJECT_FIELD] = sizeof(FieldObject),
470 [OBJECT_ENTRY] = sizeof(EntryObject),
471 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
472 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
473 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
474 [OBJECT_TAG] = sizeof(TagObject),
475 };
476
477 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
478 return sizeof(ObjectHeader);
479
480 return table[o->object.type];
481}
482
78519831 483int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
484 int r;
485 void *t;
486 Object *o;
487 uint64_t s;
488
489 assert(f);
490 assert(ret);
491
db11ac1a
LP
492 /* Objects may only be located at multiple of 64 bit */
493 if (!VALID64(offset))
494 return -EFAULT;
495
7a9dabea 496 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
497 if (r < 0)
498 return r;
499
500 o = (Object*) t;
501 s = le64toh(o->object.size);
502
503 if (s < sizeof(ObjectHeader))
504 return -EBADMSG;
505
16e9f408
LP
506 if (o->object.type <= OBJECT_UNUSED)
507 return -EBADMSG;
508
509 if (s < minimum_header_size(o))
510 return -EBADMSG;
511
d05089d8 512 if (type > OBJECT_UNUSED && o->object.type != type)
cec736d2
LP
513 return -EBADMSG;
514
515 if (s > sizeof(ObjectHeader)) {
7a9dabea 516 r = journal_file_move_to(f, type, false, offset, s, &t);
cec736d2
LP
517 if (r < 0)
518 return r;
519
520 o = (Object*) t;
521 }
522
cec736d2
LP
523 *ret = o;
524 return 0;
525}
526
d98cc1f2 527static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
528 uint64_t r;
529
530 assert(f);
531
beec0085 532 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
533
534 if (seqnum) {
de190aef 535 /* If an external seqnum counter was passed, we update
c2373f84
LP
536 * both the local and the external one, and set it to
537 * the maximum of both */
538
539 if (*seqnum + 1 > r)
540 r = *seqnum + 1;
541
542 *seqnum = r;
543 }
544
beec0085 545 f->header->tail_entry_seqnum = htole64(r);
cec736d2 546
beec0085
LP
547 if (f->header->head_entry_seqnum == 0)
548 f->header->head_entry_seqnum = htole64(r);
de190aef 549
cec736d2
LP
550 return r;
551}
552
78519831 553int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
554 int r;
555 uint64_t p;
556 Object *tail, *o;
557 void *t;
558
559 assert(f);
d05089d8 560 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
561 assert(size >= sizeof(ObjectHeader));
562 assert(offset);
563 assert(ret);
564
26687bf8
OS
565 r = journal_file_set_online(f);
566 if (r < 0)
567 return r;
568
cec736d2 569 p = le64toh(f->header->tail_object_offset);
cec736d2 570 if (p == 0)
23b0b2b2 571 p = le64toh(f->header->header_size);
cec736d2 572 else {
d05089d8 573 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
574 if (r < 0)
575 return r;
576
577 p += ALIGN64(le64toh(tail->object.size));
578 }
579
580 r = journal_file_allocate(f, p, size);
581 if (r < 0)
582 return r;
583
fcde2389 584 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
585 if (r < 0)
586 return r;
587
588 o = (Object*) t;
589
590 zero(o->object);
de190aef 591 o->object.type = type;
cec736d2
LP
592 o->object.size = htole64(size);
593
594 f->header->tail_object_offset = htole64(p);
cec736d2
LP
595 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
596
597 *ret = o;
598 *offset = p;
599
600 return 0;
601}
602
de190aef 603static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
604 uint64_t s, p;
605 Object *o;
606 int r;
607
608 assert(f);
609
070052ab
LP
610 /* We estimate that we need 1 hash table entry per 768 bytes
611 of journal file and we want to make sure we never get
612 beyond 75% fill level. Calculate the hash table size for
613 the maximum file size based on these metrics. */
4a92baf3 614
dfabe643 615 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
616 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
617 s = DEFAULT_DATA_HASH_TABLE_SIZE;
618
507f22bd 619 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 620
de190aef
LP
621 r = journal_file_append_object(f,
622 OBJECT_DATA_HASH_TABLE,
623 offsetof(Object, hash_table.items) + s,
624 &o, &p);
cec736d2
LP
625 if (r < 0)
626 return r;
627
29804cc1 628 memzero(o->hash_table.items, s);
cec736d2 629
de190aef
LP
630 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
631 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
632
633 return 0;
634}
635
de190aef 636static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
637 uint64_t s, p;
638 Object *o;
639 int r;
640
641 assert(f);
642
3c1668da
LP
643 /* We use a fixed size hash table for the fields as this
644 * number should grow very slowly only */
645
de190aef
LP
646 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
647 r = journal_file_append_object(f,
648 OBJECT_FIELD_HASH_TABLE,
649 offsetof(Object, hash_table.items) + s,
650 &o, &p);
cec736d2
LP
651 if (r < 0)
652 return r;
653
29804cc1 654 memzero(o->hash_table.items, s);
cec736d2 655
de190aef
LP
656 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
657 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
658
659 return 0;
660}
661
dade37d4 662int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
663 uint64_t s, p;
664 void *t;
665 int r;
666
667 assert(f);
668
dade37d4
LP
669 if (f->data_hash_table)
670 return 0;
671
de190aef
LP
672 p = le64toh(f->header->data_hash_table_offset);
673 s = le64toh(f->header->data_hash_table_size);
cec736d2 674
de190aef 675 r = journal_file_move_to(f,
16e9f408 676 OBJECT_DATA_HASH_TABLE,
fcde2389 677 true,
de190aef
LP
678 p, s,
679 &t);
cec736d2
LP
680 if (r < 0)
681 return r;
682
de190aef 683 f->data_hash_table = t;
cec736d2
LP
684 return 0;
685}
686
dade37d4 687int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
688 uint64_t s, p;
689 void *t;
690 int r;
691
692 assert(f);
693
dade37d4
LP
694 if (f->field_hash_table)
695 return 0;
696
de190aef
LP
697 p = le64toh(f->header->field_hash_table_offset);
698 s = le64toh(f->header->field_hash_table_size);
cec736d2 699
de190aef 700 r = journal_file_move_to(f,
16e9f408 701 OBJECT_FIELD_HASH_TABLE,
fcde2389 702 true,
de190aef
LP
703 p, s,
704 &t);
cec736d2
LP
705 if (r < 0)
706 return r;
707
de190aef 708 f->field_hash_table = t;
cec736d2
LP
709 return 0;
710}
711
3c1668da
LP
712static int journal_file_link_field(
713 JournalFile *f,
714 Object *o,
715 uint64_t offset,
716 uint64_t hash) {
717
805d1486 718 uint64_t p, h, m;
3c1668da
LP
719 int r;
720
721 assert(f);
722 assert(o);
723 assert(offset > 0);
724
725 if (o->object.type != OBJECT_FIELD)
726 return -EINVAL;
727
805d1486
LP
728 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
729 if (m <= 0)
730 return -EBADMSG;
3c1668da 731
805d1486 732 /* This might alter the window we are looking at */
3c1668da
LP
733 o->field.next_hash_offset = o->field.head_data_offset = 0;
734
805d1486 735 h = hash % m;
3c1668da
LP
736 p = le64toh(f->field_hash_table[h].tail_hash_offset);
737 if (p == 0)
738 f->field_hash_table[h].head_hash_offset = htole64(offset);
739 else {
740 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
741 if (r < 0)
742 return r;
743
744 o->field.next_hash_offset = htole64(offset);
745 }
746
747 f->field_hash_table[h].tail_hash_offset = htole64(offset);
748
749 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
750 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
751
752 return 0;
753}
754
755static int journal_file_link_data(
756 JournalFile *f,
757 Object *o,
758 uint64_t offset,
759 uint64_t hash) {
760
805d1486 761 uint64_t p, h, m;
cec736d2
LP
762 int r;
763
764 assert(f);
765 assert(o);
766 assert(offset > 0);
b588975f
LP
767
768 if (o->object.type != OBJECT_DATA)
769 return -EINVAL;
cec736d2 770
805d1486
LP
771 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
772 if (m <= 0)
773 return -EBADMSG;
48496df6 774
805d1486 775 /* This might alter the window we are looking at */
de190aef
LP
776 o->data.next_hash_offset = o->data.next_field_offset = 0;
777 o->data.entry_offset = o->data.entry_array_offset = 0;
778 o->data.n_entries = 0;
cec736d2 779
805d1486 780 h = hash % m;
8db4213e 781 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 782 if (p == 0)
cec736d2 783 /* Only entry in the hash table is easy */
de190aef 784 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 785 else {
48496df6
LP
786 /* Move back to the previous data object, to patch in
787 * pointer */
cec736d2 788
de190aef 789 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
790 if (r < 0)
791 return r;
792
de190aef 793 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
794 }
795
de190aef 796 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 797
dca6219e
LP
798 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
799 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
800
cec736d2
LP
801 return 0;
802}
803
3c1668da
LP
804int journal_file_find_field_object_with_hash(
805 JournalFile *f,
806 const void *field, uint64_t size, uint64_t hash,
807 Object **ret, uint64_t *offset) {
808
805d1486 809 uint64_t p, osize, h, m;
3c1668da
LP
810 int r;
811
812 assert(f);
813 assert(field && size > 0);
814
dade37d4
LP
815 /* If the field hash table is empty, we can't find anything */
816 if (le64toh(f->header->field_hash_table_size) <= 0)
817 return 0;
818
819 /* Map the field hash table, if it isn't mapped yet. */
820 r = journal_file_map_field_hash_table(f);
821 if (r < 0)
822 return r;
823
3c1668da
LP
824 osize = offsetof(Object, field.payload) + size;
825
805d1486 826 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
805d1486 827 if (m <= 0)
3c1668da
LP
828 return -EBADMSG;
829
805d1486 830 h = hash % m;
3c1668da
LP
831 p = le64toh(f->field_hash_table[h].head_hash_offset);
832
833 while (p > 0) {
834 Object *o;
835
836 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
837 if (r < 0)
838 return r;
839
840 if (le64toh(o->field.hash) == hash &&
841 le64toh(o->object.size) == osize &&
842 memcmp(o->field.payload, field, size) == 0) {
843
844 if (ret)
845 *ret = o;
846 if (offset)
847 *offset = p;
848
849 return 1;
850 }
851
852 p = le64toh(o->field.next_hash_offset);
853 }
854
855 return 0;
856}
857
858int journal_file_find_field_object(
859 JournalFile *f,
860 const void *field, uint64_t size,
861 Object **ret, uint64_t *offset) {
862
863 uint64_t hash;
864
865 assert(f);
866 assert(field && size > 0);
867
868 hash = hash64(field, size);
869
870 return journal_file_find_field_object_with_hash(f,
871 field, size, hash,
872 ret, offset);
873}
874
de190aef
LP
875int journal_file_find_data_object_with_hash(
876 JournalFile *f,
877 const void *data, uint64_t size, uint64_t hash,
878 Object **ret, uint64_t *offset) {
48496df6 879
805d1486 880 uint64_t p, osize, h, m;
cec736d2
LP
881 int r;
882
883 assert(f);
884 assert(data || size == 0);
885
dade37d4
LP
886 /* If there's no data hash table, then there's no entry. */
887 if (le64toh(f->header->data_hash_table_size) <= 0)
888 return 0;
889
890 /* Map the data hash table, if it isn't mapped yet. */
891 r = journal_file_map_data_hash_table(f);
892 if (r < 0)
893 return r;
894
cec736d2
LP
895 osize = offsetof(Object, data.payload) + size;
896
805d1486
LP
897 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
898 if (m <= 0)
bc85bfee
LP
899 return -EBADMSG;
900
805d1486 901 h = hash % m;
de190aef 902 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 903
de190aef
LP
904 while (p > 0) {
905 Object *o;
cec736d2 906
de190aef 907 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
908 if (r < 0)
909 return r;
910
807e17f0 911 if (le64toh(o->data.hash) != hash)
85a131e8 912 goto next;
807e17f0 913
d89c8fdf 914 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 915#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51 916 uint64_t l;
a7f7d1bd 917 size_t rsize = 0;
cec736d2 918
807e17f0
LP
919 l = le64toh(o->object.size);
920 if (l <= offsetof(Object, data.payload))
cec736d2
LP
921 return -EBADMSG;
922
807e17f0
LP
923 l -= offsetof(Object, data.payload);
924
d89c8fdf
ZJS
925 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
926 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
927 if (r < 0)
928 return r;
807e17f0 929
b785c858 930 if (rsize == size &&
807e17f0
LP
931 memcmp(f->compress_buffer, data, size) == 0) {
932
933 if (ret)
934 *ret = o;
935
936 if (offset)
937 *offset = p;
938
939 return 1;
940 }
3b1a55e1
ZJS
941#else
942 return -EPROTONOSUPPORT;
943#endif
807e17f0
LP
944 } else if (le64toh(o->object.size) == osize &&
945 memcmp(o->data.payload, data, size) == 0) {
946
cec736d2
LP
947 if (ret)
948 *ret = o;
949
950 if (offset)
951 *offset = p;
952
de190aef 953 return 1;
cec736d2
LP
954 }
955
85a131e8 956 next:
cec736d2
LP
957 p = le64toh(o->data.next_hash_offset);
958 }
959
de190aef
LP
960 return 0;
961}
962
963int journal_file_find_data_object(
964 JournalFile *f,
965 const void *data, uint64_t size,
966 Object **ret, uint64_t *offset) {
967
968 uint64_t hash;
969
970 assert(f);
971 assert(data || size == 0);
972
973 hash = hash64(data, size);
974
975 return journal_file_find_data_object_with_hash(f,
976 data, size, hash,
977 ret, offset);
978}
979
3c1668da
LP
980static int journal_file_append_field(
981 JournalFile *f,
982 const void *field, uint64_t size,
983 Object **ret, uint64_t *offset) {
984
985 uint64_t hash, p;
986 uint64_t osize;
987 Object *o;
988 int r;
989
990 assert(f);
991 assert(field && size > 0);
992
993 hash = hash64(field, size);
994
995 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
996 if (r < 0)
997 return r;
998 else if (r > 0) {
999
1000 if (ret)
1001 *ret = o;
1002
1003 if (offset)
1004 *offset = p;
1005
1006 return 0;
1007 }
1008
1009 osize = offsetof(Object, field.payload) + size;
1010 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
1011 if (r < 0)
1012 return r;
3c1668da
LP
1013
1014 o->field.hash = htole64(hash);
1015 memcpy(o->field.payload, field, size);
1016
1017 r = journal_file_link_field(f, o, p, hash);
1018 if (r < 0)
1019 return r;
1020
1021 /* The linking might have altered the window, so let's
1022 * refresh our pointer */
1023 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1024 if (r < 0)
1025 return r;
1026
1027#ifdef HAVE_GCRYPT
1028 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1029 if (r < 0)
1030 return r;
1031#endif
1032
1033 if (ret)
1034 *ret = o;
1035
1036 if (offset)
1037 *offset = p;
1038
1039 return 0;
1040}
1041
48496df6
LP
1042static int journal_file_append_data(
1043 JournalFile *f,
1044 const void *data, uint64_t size,
1045 Object **ret, uint64_t *offset) {
1046
de190aef
LP
1047 uint64_t hash, p;
1048 uint64_t osize;
1049 Object *o;
d89c8fdf 1050 int r, compression = 0;
3c1668da 1051 const void *eq;
de190aef
LP
1052
1053 assert(f);
1054 assert(data || size == 0);
1055
1056 hash = hash64(data, size);
1057
1058 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1059 if (r < 0)
1060 return r;
1061 else if (r > 0) {
1062
1063 if (ret)
1064 *ret = o;
1065
1066 if (offset)
1067 *offset = p;
1068
1069 return 0;
1070 }
1071
1072 osize = offsetof(Object, data.payload) + size;
1073 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1074 if (r < 0)
1075 return r;
1076
cec736d2 1077 o->data.hash = htole64(hash);
807e17f0 1078
d89c8fdf
ZJS
1079#if defined(HAVE_XZ) || defined(HAVE_LZ4)
1080 if (f->compress_xz &&
807e17f0 1081 size >= COMPRESSION_SIZE_THRESHOLD) {
a7f7d1bd 1082 size_t rsize = 0;
807e17f0 1083
d89c8fdf 1084 compression = compress_blob(data, size, o->data.payload, &rsize);
807e17f0 1085
d89c8fdf 1086 if (compression) {
807e17f0 1087 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1088 o->object.flags |= compression;
807e17f0 1089
fa1c4b51 1090 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1091 size, rsize, object_compressed_to_string(compression));
807e17f0
LP
1092 }
1093 }
1094#endif
1095
d89c8fdf 1096 if (!compression && size > 0)
807e17f0 1097 memcpy(o->data.payload, data, size);
cec736d2 1098
de190aef 1099 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1100 if (r < 0)
1101 return r;
1102
48496df6
LP
1103 /* The linking might have altered the window, so let's
1104 * refresh our pointer */
1105 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1106 if (r < 0)
1107 return r;
1108
08c6f819
SL
1109 if (!data)
1110 eq = NULL;
1111 else
1112 eq = memchr(data, '=', size);
3c1668da 1113 if (eq && eq > data) {
748db592 1114 Object *fo = NULL;
3c1668da 1115 uint64_t fp;
3c1668da
LP
1116
1117 /* Create field object ... */
1118 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1119 if (r < 0)
1120 return r;
1121
1122 /* ... and link it in. */
1123 o->data.next_field_offset = fo->field.head_data_offset;
1124 fo->field.head_data_offset = le64toh(p);
1125 }
1126
5996c7c2
LP
1127#ifdef HAVE_GCRYPT
1128 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1129 if (r < 0)
1130 return r;
1131#endif
1132
cec736d2
LP
1133 if (ret)
1134 *ret = o;
1135
1136 if (offset)
de190aef 1137 *offset = p;
cec736d2
LP
1138
1139 return 0;
1140}
1141
1142uint64_t journal_file_entry_n_items(Object *o) {
1143 assert(o);
b588975f
LP
1144
1145 if (o->object.type != OBJECT_ENTRY)
1146 return 0;
cec736d2
LP
1147
1148 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1149}
1150
0284adc6 1151uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1152 assert(o);
b588975f
LP
1153
1154 if (o->object.type != OBJECT_ENTRY_ARRAY)
1155 return 0;
de190aef
LP
1156
1157 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1158}
1159
fb9a24b6
LP
1160uint64_t journal_file_hash_table_n_items(Object *o) {
1161 assert(o);
b588975f
LP
1162
1163 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1164 o->object.type != OBJECT_FIELD_HASH_TABLE)
1165 return 0;
fb9a24b6
LP
1166
1167 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1168}
1169
de190aef 1170static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1171 le64_t *first,
1172 le64_t *idx,
de190aef 1173 uint64_t p) {
cec736d2 1174 int r;
de190aef
LP
1175 uint64_t n = 0, ap = 0, q, i, a, hidx;
1176 Object *o;
1177
cec736d2 1178 assert(f);
de190aef
LP
1179 assert(first);
1180 assert(idx);
1181 assert(p > 0);
cec736d2 1182
de190aef
LP
1183 a = le64toh(*first);
1184 i = hidx = le64toh(*idx);
1185 while (a > 0) {
1186
1187 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1188 if (r < 0)
1189 return r;
cec736d2 1190
de190aef
LP
1191 n = journal_file_entry_array_n_items(o);
1192 if (i < n) {
1193 o->entry_array.items[i] = htole64(p);
1194 *idx = htole64(hidx + 1);
1195 return 0;
1196 }
cec736d2 1197
de190aef
LP
1198 i -= n;
1199 ap = a;
1200 a = le64toh(o->entry_array.next_entry_array_offset);
1201 }
1202
1203 if (hidx > n)
1204 n = (hidx+1) * 2;
1205 else
1206 n = n * 2;
1207
1208 if (n < 4)
1209 n = 4;
1210
1211 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1212 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1213 &o, &q);
cec736d2
LP
1214 if (r < 0)
1215 return r;
1216
feb12d3e 1217#ifdef HAVE_GCRYPT
5996c7c2 1218 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1219 if (r < 0)
1220 return r;
feb12d3e 1221#endif
b0af6f41 1222
de190aef 1223 o->entry_array.items[i] = htole64(p);
cec736d2 1224
de190aef 1225 if (ap == 0)
7be3aa17 1226 *first = htole64(q);
cec736d2 1227 else {
de190aef 1228 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1229 if (r < 0)
1230 return r;
1231
de190aef
LP
1232 o->entry_array.next_entry_array_offset = htole64(q);
1233 }
cec736d2 1234
2dee23eb
LP
1235 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1236 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1237
de190aef
LP
1238 *idx = htole64(hidx + 1);
1239
1240 return 0;
1241}
cec736d2 1242
de190aef 1243static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1244 le64_t *extra,
1245 le64_t *first,
1246 le64_t *idx,
de190aef
LP
1247 uint64_t p) {
1248
1249 int r;
1250
1251 assert(f);
1252 assert(extra);
1253 assert(first);
1254 assert(idx);
1255 assert(p > 0);
1256
1257 if (*idx == 0)
1258 *extra = htole64(p);
1259 else {
4fd052ae 1260 le64_t i;
de190aef 1261
7be3aa17 1262 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1263 r = link_entry_into_array(f, first, &i, p);
1264 if (r < 0)
1265 return r;
cec736d2
LP
1266 }
1267
de190aef
LP
1268 *idx = htole64(le64toh(*idx) + 1);
1269 return 0;
1270}
1271
1272static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1273 uint64_t p;
1274 int r;
1275 assert(f);
1276 assert(o);
1277 assert(offset > 0);
1278
1279 p = le64toh(o->entry.items[i].object_offset);
1280 if (p == 0)
1281 return -EINVAL;
1282
1283 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1284 if (r < 0)
1285 return r;
1286
de190aef
LP
1287 return link_entry_into_array_plus_one(f,
1288 &o->data.entry_offset,
1289 &o->data.entry_array_offset,
1290 &o->data.n_entries,
1291 offset);
cec736d2
LP
1292}
1293
1294static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1295 uint64_t n, i;
cec736d2
LP
1296 int r;
1297
1298 assert(f);
1299 assert(o);
1300 assert(offset > 0);
b588975f
LP
1301
1302 if (o->object.type != OBJECT_ENTRY)
1303 return -EINVAL;
cec736d2 1304
b788cc23
LP
1305 __sync_synchronize();
1306
cec736d2 1307 /* Link up the entry itself */
de190aef
LP
1308 r = link_entry_into_array(f,
1309 &f->header->entry_array_offset,
1310 &f->header->n_entries,
1311 offset);
1312 if (r < 0)
1313 return r;
cec736d2 1314
507f22bd 1315 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1316
de190aef 1317 if (f->header->head_entry_realtime == 0)
0ac38b70 1318 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1319
0ac38b70 1320 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1321 f->header->tail_entry_monotonic = o->entry.monotonic;
1322
1323 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1324
1325 /* Link up the items */
1326 n = journal_file_entry_n_items(o);
1327 for (i = 0; i < n; i++) {
1328 r = journal_file_link_entry_item(f, o, offset, i);
1329 if (r < 0)
1330 return r;
1331 }
1332
cec736d2
LP
1333 return 0;
1334}
1335
1336static int journal_file_append_entry_internal(
1337 JournalFile *f,
1338 const dual_timestamp *ts,
1339 uint64_t xor_hash,
1340 const EntryItem items[], unsigned n_items,
de190aef 1341 uint64_t *seqnum,
cec736d2
LP
1342 Object **ret, uint64_t *offset) {
1343 uint64_t np;
1344 uint64_t osize;
1345 Object *o;
1346 int r;
1347
1348 assert(f);
1349 assert(items || n_items == 0);
de190aef 1350 assert(ts);
cec736d2
LP
1351
1352 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1353
de190aef 1354 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1355 if (r < 0)
1356 return r;
1357
d98cc1f2 1358 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1359 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1360 o->entry.realtime = htole64(ts->realtime);
1361 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1362 o->entry.xor_hash = htole64(xor_hash);
1363 o->entry.boot_id = f->header->boot_id;
1364
feb12d3e 1365#ifdef HAVE_GCRYPT
5996c7c2 1366 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1367 if (r < 0)
1368 return r;
feb12d3e 1369#endif
b0af6f41 1370
cec736d2
LP
1371 r = journal_file_link_entry(f, o, np);
1372 if (r < 0)
1373 return r;
1374
1375 if (ret)
1376 *ret = o;
1377
1378 if (offset)
1379 *offset = np;
1380
1381 return 0;
1382}
1383
cf244689 1384void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1385 assert(f);
1386
1387 /* inotify() does not receive IN_MODIFY events from file
1388 * accesses done via mmap(). After each access we hence
1389 * trigger IN_MODIFY by truncating the journal file to its
1390 * current size which triggers IN_MODIFY. */
1391
bc85bfee
LP
1392 __sync_synchronize();
1393
50f20cfd 1394 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
56f64d95 1395 log_error_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1396}
1397
1f2da9ec
LP
1398static int entry_item_cmp(const void *_a, const void *_b) {
1399 const EntryItem *a = _a, *b = _b;
1400
1401 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1402 return -1;
1403 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1404 return 1;
1405 return 0;
1406}
1407
de190aef 1408int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1409 unsigned i;
1410 EntryItem *items;
1411 int r;
1412 uint64_t xor_hash = 0;
de190aef 1413 struct dual_timestamp _ts;
cec736d2
LP
1414
1415 assert(f);
1416 assert(iovec || n_iovec == 0);
1417
de190aef
LP
1418 if (!ts) {
1419 dual_timestamp_get(&_ts);
1420 ts = &_ts;
1421 }
1422
1423 if (f->tail_entry_monotonic_valid &&
1424 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1425 return -EINVAL;
1426
feb12d3e 1427#ifdef HAVE_GCRYPT
7560fffc
LP
1428 r = journal_file_maybe_append_tag(f, ts->realtime);
1429 if (r < 0)
1430 return r;
feb12d3e 1431#endif
7560fffc 1432
64825d3c 1433 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1434 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1435
1436 for (i = 0; i < n_iovec; i++) {
1437 uint64_t p;
1438 Object *o;
1439
1440 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1441 if (r < 0)
cf244689 1442 return r;
cec736d2
LP
1443
1444 xor_hash ^= le64toh(o->data.hash);
1445 items[i].object_offset = htole64(p);
de7b95cd 1446 items[i].hash = o->data.hash;
cec736d2
LP
1447 }
1448
1f2da9ec
LP
1449 /* Order by the position on disk, in order to improve seek
1450 * times for rotating media. */
7ff7394d 1451 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1452
de190aef 1453 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1454
fa6ac760
LP
1455 /* If the memory mapping triggered a SIGBUS then we return an
1456 * IO error and ignore the error code passed down to us, since
1457 * it is very likely just an effect of a nullified replacement
1458 * mapping page */
1459
1460 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1461 r = -EIO;
1462
50f20cfd
LP
1463 journal_file_post_change(f);
1464
cec736d2
LP
1465 return r;
1466}
1467
/* One cached position inside a chain of entry arrays. Items are keyed by the
 * offset of the chain's first array and let lookups and bisections resume at
 * the array they last ended up in instead of walking the chain from the
 * start. */
typedef struct ChainCacheItem {
        uint64_t first;      /* offset of the array the chain starts with (the cache key) */
        uint64_t array;      /* offset of the array that was cached */
        uint64_t begin;      /* first item stored in the cached array */
        uint64_t total;      /* number of items in all arrays of the chain preceding the cached one */
        uint64_t last_index; /* index looked at last, so that bisection can start out close to it */
} ChainCacheItem;
1475
1476static void chain_cache_put(
4743015d 1477 OrderedHashmap *h,
a4bcff5b
LP
1478 ChainCacheItem *ci,
1479 uint64_t first,
1480 uint64_t array,
1481 uint64_t begin,
f268980d
LP
1482 uint64_t total,
1483 uint64_t last_index) {
a4bcff5b
LP
1484
1485 if (!ci) {
34741aa3
LP
1486 /* If the chain item to cache for this chain is the
1487 * first one it's not worth caching anything */
1488 if (array == first)
1489 return;
1490
29433089 1491 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 1492 ci = ordered_hashmap_steal_first(h);
29433089
LP
1493 assert(ci);
1494 } else {
a4bcff5b
LP
1495 ci = new(ChainCacheItem, 1);
1496 if (!ci)
1497 return;
1498 }
1499
1500 ci->first = first;
1501
4743015d 1502 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
1503 free(ci);
1504 return;
1505 }
1506 } else
1507 assert(ci->first == first);
1508
1509 ci->array = array;
1510 ci->begin = begin;
1511 ci->total = total;
f268980d 1512 ci->last_index = last_index;
a4bcff5b
LP
1513}
1514
f268980d
LP
1515static int generic_array_get(
1516 JournalFile *f,
1517 uint64_t first,
1518 uint64_t i,
1519 Object **ret, uint64_t *offset) {
de190aef 1520
cec736d2 1521 Object *o;
a4bcff5b 1522 uint64_t p = 0, a, t = 0;
cec736d2 1523 int r;
a4bcff5b 1524 ChainCacheItem *ci;
cec736d2
LP
1525
1526 assert(f);
1527
de190aef 1528 a = first;
a4bcff5b
LP
1529
1530 /* Try the chain cache first */
4743015d 1531 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1532 if (ci && i > ci->total) {
1533 a = ci->array;
1534 i -= ci->total;
1535 t = ci->total;
1536 }
1537
de190aef 1538 while (a > 0) {
a4bcff5b 1539 uint64_t k;
cec736d2 1540
de190aef
LP
1541 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1542 if (r < 0)
1543 return r;
cec736d2 1544
a4bcff5b
LP
1545 k = journal_file_entry_array_n_items(o);
1546 if (i < k) {
de190aef 1547 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1548 goto found;
cec736d2
LP
1549 }
1550
a4bcff5b
LP
1551 i -= k;
1552 t += k;
de190aef
LP
1553 a = le64toh(o->entry_array.next_entry_array_offset);
1554 }
1555
a4bcff5b
LP
1556 return 0;
1557
1558found:
1559 /* Let's cache this item for the next invocation */
af13a6b0 1560 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1561
1562 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1563 if (r < 0)
1564 return r;
1565
1566 if (ret)
1567 *ret = o;
1568
1569 if (offset)
1570 *offset = p;
1571
1572 return 1;
1573}
1574
f268980d
LP
1575static int generic_array_get_plus_one(
1576 JournalFile *f,
1577 uint64_t extra,
1578 uint64_t first,
1579 uint64_t i,
1580 Object **ret, uint64_t *offset) {
de190aef
LP
1581
1582 Object *o;
1583
1584 assert(f);
1585
1586 if (i == 0) {
1587 int r;
1588
1589 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1590 if (r < 0)
1591 return r;
1592
de190aef
LP
1593 if (ret)
1594 *ret = o;
cec736d2 1595
de190aef
LP
1596 if (offset)
1597 *offset = extra;
cec736d2 1598
de190aef 1599 return 1;
cec736d2
LP
1600 }
1601
de190aef
LP
1602 return generic_array_get(f, first, i-1, ret, offset);
1603}
cec736d2 1604
de190aef
LP
/* Results of the test_object callbacks used when bisecting entry arrays */
enum {
        TEST_FOUND, /* the tested object matches the needle */
        TEST_LEFT,  /* the tested object sorts before the needle */
        TEST_RIGHT  /* the tested object sorts after the needle */
};
cec736d2 1610
f268980d
LP
1611static int generic_array_bisect(
1612 JournalFile *f,
1613 uint64_t first,
1614 uint64_t n,
1615 uint64_t needle,
1616 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1617 direction_t direction,
1618 Object **ret,
1619 uint64_t *offset,
1620 uint64_t *idx) {
1621
1622 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1623 bool subtract_one = false;
1624 Object *o, *array = NULL;
1625 int r;
a4bcff5b 1626 ChainCacheItem *ci;
cec736d2 1627
de190aef
LP
1628 assert(f);
1629 assert(test_object);
cec736d2 1630
a4bcff5b 1631 /* Start with the first array in the chain */
de190aef 1632 a = first;
a4bcff5b 1633
4743015d 1634 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1635 if (ci && n > ci->total) {
1636 /* Ah, we have iterated this bisection array chain
1637 * previously! Let's see if we can skip ahead in the
1638 * chain, as far as the last time. But we can't jump
1639 * backwards in the chain, so let's check that
1640 * first. */
1641
1642 r = test_object(f, ci->begin, needle);
1643 if (r < 0)
1644 return r;
1645
1646 if (r == TEST_LEFT) {
f268980d 1647 /* OK, what we are looking for is right of the
a4bcff5b
LP
1648 * begin of this EntryArray, so let's jump
1649 * straight to previously cached array in the
1650 * chain */
1651
1652 a = ci->array;
1653 n -= ci->total;
1654 t = ci->total;
f268980d 1655 last_index = ci->last_index;
a4bcff5b
LP
1656 }
1657 }
1658
de190aef
LP
1659 while (a > 0) {
1660 uint64_t left, right, k, lp;
1661
1662 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1663 if (r < 0)
1664 return r;
1665
de190aef
LP
1666 k = journal_file_entry_array_n_items(array);
1667 right = MIN(k, n);
1668 if (right <= 0)
1669 return 0;
cec736d2 1670
de190aef
LP
1671 i = right - 1;
1672 lp = p = le64toh(array->entry_array.items[i]);
1673 if (p <= 0)
1674 return -EBADMSG;
cec736d2 1675
de190aef
LP
1676 r = test_object(f, p, needle);
1677 if (r < 0)
1678 return r;
cec736d2 1679
de190aef
LP
1680 if (r == TEST_FOUND)
1681 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1682
1683 if (r == TEST_RIGHT) {
1684 left = 0;
1685 right -= 1;
f268980d
LP
1686
1687 if (last_index != (uint64_t) -1) {
1688 assert(last_index <= right);
1689
1690 /* If we cached the last index we
1691 * looked at, let's try to not to jump
1692 * too wildly around and see if we can
1693 * limit the range to look at early to
1694 * the immediate neighbors of the last
1695 * index we looked at. */
1696
1697 if (last_index > 0) {
1698 uint64_t x = last_index - 1;
1699
1700 p = le64toh(array->entry_array.items[x]);
1701 if (p <= 0)
1702 return -EBADMSG;
1703
1704 r = test_object(f, p, needle);
1705 if (r < 0)
1706 return r;
1707
1708 if (r == TEST_FOUND)
1709 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1710
1711 if (r == TEST_RIGHT)
1712 right = x;
1713 else
1714 left = x + 1;
1715 }
1716
1717 if (last_index < right) {
1718 uint64_t y = last_index + 1;
1719
1720 p = le64toh(array->entry_array.items[y]);
1721 if (p <= 0)
1722 return -EBADMSG;
1723
1724 r = test_object(f, p, needle);
1725 if (r < 0)
1726 return r;
1727
1728 if (r == TEST_FOUND)
1729 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1730
1731 if (r == TEST_RIGHT)
1732 right = y;
1733 else
1734 left = y + 1;
1735 }
f268980d
LP
1736 }
1737
de190aef
LP
1738 for (;;) {
1739 if (left == right) {
1740 if (direction == DIRECTION_UP)
1741 subtract_one = true;
1742
1743 i = left;
1744 goto found;
1745 }
1746
1747 assert(left < right);
de190aef 1748 i = (left + right) / 2;
f268980d 1749
de190aef
LP
1750 p = le64toh(array->entry_array.items[i]);
1751 if (p <= 0)
1752 return -EBADMSG;
1753
1754 r = test_object(f, p, needle);
1755 if (r < 0)
1756 return r;
cec736d2 1757
de190aef
LP
1758 if (r == TEST_FOUND)
1759 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1760
1761 if (r == TEST_RIGHT)
1762 right = i;
1763 else
1764 left = i + 1;
1765 }
1766 }
1767
2173cbf8 1768 if (k >= n) {
cbdca852
LP
1769 if (direction == DIRECTION_UP) {
1770 i = n;
1771 subtract_one = true;
1772 goto found;
1773 }
1774
cec736d2 1775 return 0;
cbdca852 1776 }
cec736d2 1777
de190aef
LP
1778 last_p = lp;
1779
1780 n -= k;
1781 t += k;
f268980d 1782 last_index = (uint64_t) -1;
de190aef 1783 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1784 }
1785
1786 return 0;
de190aef
LP
1787
1788found:
1789 if (subtract_one && t == 0 && i == 0)
1790 return 0;
1791
a4bcff5b 1792 /* Let's cache this item for the next invocation */
af13a6b0 1793 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 1794
de190aef
LP
1795 if (subtract_one && i == 0)
1796 p = last_p;
1797 else if (subtract_one)
1798 p = le64toh(array->entry_array.items[i-1]);
1799 else
1800 p = le64toh(array->entry_array.items[i]);
1801
1802 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1803 if (r < 0)
1804 return r;
1805
1806 if (ret)
1807 *ret = o;
1808
1809 if (offset)
1810 *offset = p;
1811
1812 if (idx)
cbdca852 1813 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1814
1815 return 1;
cec736d2
LP
1816}
1817
f268980d
LP
1818static int generic_array_bisect_plus_one(
1819 JournalFile *f,
1820 uint64_t extra,
1821 uint64_t first,
1822 uint64_t n,
1823 uint64_t needle,
1824 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1825 direction_t direction,
1826 Object **ret,
1827 uint64_t *offset,
1828 uint64_t *idx) {
de190aef 1829
cec736d2 1830 int r;
cbdca852
LP
1831 bool step_back = false;
1832 Object *o;
cec736d2
LP
1833
1834 assert(f);
de190aef 1835 assert(test_object);
cec736d2 1836
de190aef
LP
1837 if (n <= 0)
1838 return 0;
cec736d2 1839
de190aef
LP
1840 /* This bisects the array in object 'first', but first checks
1841 * an extra */
de190aef
LP
1842 r = test_object(f, extra, needle);
1843 if (r < 0)
1844 return r;
a536e261
LP
1845
1846 if (r == TEST_FOUND)
1847 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1848
cbdca852
LP
1849 /* if we are looking with DIRECTION_UP then we need to first
1850 see if in the actual array there is a matching entry, and
1851 return the last one of that. But if there isn't any we need
1852 to return this one. Hence remember this, and return it
1853 below. */
1854 if (r == TEST_LEFT)
1855 step_back = direction == DIRECTION_UP;
de190aef 1856
cbdca852
LP
1857 if (r == TEST_RIGHT) {
1858 if (direction == DIRECTION_DOWN)
1859 goto found;
1860 else
1861 return 0;
a536e261 1862 }
cec736d2 1863
de190aef
LP
1864 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1865
cbdca852
LP
1866 if (r == 0 && step_back)
1867 goto found;
1868
ecf68b1d 1869 if (r > 0 && idx)
de190aef
LP
1870 (*idx) ++;
1871
1872 return r;
cbdca852
LP
1873
1874found:
1875 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1876 if (r < 0)
1877 return r;
1878
1879 if (ret)
1880 *ret = o;
1881
1882 if (offset)
1883 *offset = extra;
1884
1885 if (idx)
1886 *idx = 0;
1887
1888 return 1;
1889}
1890
44a6b1b6 1891_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1892 assert(f);
1893 assert(p > 0);
1894
1895 if (p == needle)
1896 return TEST_FOUND;
1897 else if (p < needle)
1898 return TEST_LEFT;
1899 else
1900 return TEST_RIGHT;
1901}
1902
de190aef
LP
1903static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1904 Object *o;
1905 int r;
1906
1907 assert(f);
1908 assert(p > 0);
1909
1910 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1911 if (r < 0)
1912 return r;
1913
de190aef
LP
1914 if (le64toh(o->entry.seqnum) == needle)
1915 return TEST_FOUND;
1916 else if (le64toh(o->entry.seqnum) < needle)
1917 return TEST_LEFT;
1918 else
1919 return TEST_RIGHT;
1920}
cec736d2 1921
de190aef
LP
1922int journal_file_move_to_entry_by_seqnum(
1923 JournalFile *f,
1924 uint64_t seqnum,
1925 direction_t direction,
1926 Object **ret,
1927 uint64_t *offset) {
1928
1929 return generic_array_bisect(f,
1930 le64toh(f->header->entry_array_offset),
1931 le64toh(f->header->n_entries),
1932 seqnum,
1933 test_object_seqnum,
1934 direction,
1935 ret, offset, NULL);
1936}
cec736d2 1937
de190aef
LP
1938static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1939 Object *o;
1940 int r;
1941
1942 assert(f);
1943 assert(p > 0);
1944
1945 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1946 if (r < 0)
1947 return r;
1948
1949 if (le64toh(o->entry.realtime) == needle)
1950 return TEST_FOUND;
1951 else if (le64toh(o->entry.realtime) < needle)
1952 return TEST_LEFT;
1953 else
1954 return TEST_RIGHT;
cec736d2
LP
1955}
1956
de190aef
LP
1957int journal_file_move_to_entry_by_realtime(
1958 JournalFile *f,
1959 uint64_t realtime,
1960 direction_t direction,
1961 Object **ret,
1962 uint64_t *offset) {
1963
1964 return generic_array_bisect(f,
1965 le64toh(f->header->entry_array_offset),
1966 le64toh(f->header->n_entries),
1967 realtime,
1968 test_object_realtime,
1969 direction,
1970 ret, offset, NULL);
1971}
1972
1973static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1974 Object *o;
1975 int r;
1976
1977 assert(f);
1978 assert(p > 0);
1979
1980 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1981 if (r < 0)
1982 return r;
1983
1984 if (le64toh(o->entry.monotonic) == needle)
1985 return TEST_FOUND;
1986 else if (le64toh(o->entry.monotonic) < needle)
1987 return TEST_LEFT;
1988 else
1989 return TEST_RIGHT;
1990}
1991
2a560338 1992static int find_data_object_by_boot_id(
47838ab3
ZJS
1993 JournalFile *f,
1994 sd_id128_t boot_id,
1995 Object **o,
1996 uint64_t *b) {
2a560338 1997
47838ab3
ZJS
1998 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1999
2000 sd_id128_to_string(boot_id, t + 9);
2001 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2002}
2003
de190aef
LP
2004int journal_file_move_to_entry_by_monotonic(
2005 JournalFile *f,
2006 sd_id128_t boot_id,
2007 uint64_t monotonic,
2008 direction_t direction,
2009 Object **ret,
2010 uint64_t *offset) {
2011
de190aef
LP
2012 Object *o;
2013 int r;
2014
cbdca852 2015 assert(f);
de190aef 2016
47838ab3 2017 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
2018 if (r < 0)
2019 return r;
cbdca852 2020 if (r == 0)
de190aef
LP
2021 return -ENOENT;
2022
2023 return generic_array_bisect_plus_one(f,
2024 le64toh(o->data.entry_offset),
2025 le64toh(o->data.entry_array_offset),
2026 le64toh(o->data.n_entries),
2027 monotonic,
2028 test_object_monotonic,
2029 direction,
2030 ret, offset, NULL);
2031}
2032
1fc605b0 2033void journal_file_reset_location(JournalFile *f) {
6573ef05 2034 f->location_type = LOCATION_HEAD;
1fc605b0 2035 f->current_offset = 0;
6573ef05
MS
2036 f->current_seqnum = 0;
2037 f->current_realtime = 0;
2038 f->current_monotonic = 0;
2039 zero(f->current_boot_id);
2040 f->current_xor_hash = 0;
2041}
2042
950c07d4 2043void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2044 f->location_type = LOCATION_SEEK;
2045 f->current_offset = offset;
2046 f->current_seqnum = le64toh(o->entry.seqnum);
2047 f->current_realtime = le64toh(o->entry.realtime);
2048 f->current_monotonic = le64toh(o->entry.monotonic);
2049 f->current_boot_id = o->entry.boot_id;
2050 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2051}
2052
d8ae66d7
MS
2053int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2054 assert(af);
2055 assert(bf);
2056 assert(af->location_type == LOCATION_SEEK);
2057 assert(bf->location_type == LOCATION_SEEK);
2058
2059 /* If contents and timestamps match, these entries are
2060 * identical, even if the seqnum does not match */
2061 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2062 af->current_monotonic == bf->current_monotonic &&
2063 af->current_realtime == bf->current_realtime &&
2064 af->current_xor_hash == bf->current_xor_hash)
2065 return 0;
2066
2067 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2068
2069 /* If this is from the same seqnum source, compare
2070 * seqnums */
2071 if (af->current_seqnum < bf->current_seqnum)
2072 return -1;
2073 if (af->current_seqnum > bf->current_seqnum)
2074 return 1;
2075
2076 /* Wow! This is weird, different data but the same
2077 * seqnums? Something is borked, but let's make the
2078 * best of it and compare by time. */
2079 }
2080
2081 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2082
2083 /* If the boot id matches, compare monotonic time */
2084 if (af->current_monotonic < bf->current_monotonic)
2085 return -1;
2086 if (af->current_monotonic > bf->current_monotonic)
2087 return 1;
2088 }
2089
2090 /* Otherwise, compare UTC time */
2091 if (af->current_realtime < bf->current_realtime)
2092 return -1;
2093 if (af->current_realtime > bf->current_realtime)
2094 return 1;
2095
2096 /* Finally, compare by contents */
2097 if (af->current_xor_hash < bf->current_xor_hash)
2098 return -1;
2099 if (af->current_xor_hash > bf->current_xor_hash)
2100 return 1;
2101
2102 return 0;
2103}
2104
de190aef
LP
2105int journal_file_next_entry(
2106 JournalFile *f,
f534928a 2107 uint64_t p,
de190aef
LP
2108 direction_t direction,
2109 Object **ret, uint64_t *offset) {
2110
fb099c8d 2111 uint64_t i, n, ofs;
cec736d2
LP
2112 int r;
2113
2114 assert(f);
de190aef
LP
2115
2116 n = le64toh(f->header->n_entries);
2117 if (n <= 0)
2118 return 0;
cec736d2 2119
f534928a 2120 if (p == 0)
de190aef 2121 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2122 else {
de190aef
LP
2123 r = generic_array_bisect(f,
2124 le64toh(f->header->entry_array_offset),
2125 le64toh(f->header->n_entries),
2126 p,
2127 test_object_offset,
2128 DIRECTION_DOWN,
2129 NULL, NULL,
2130 &i);
2131 if (r <= 0)
2132 return r;
2133
2134 if (direction == DIRECTION_DOWN) {
2135 if (i >= n - 1)
2136 return 0;
2137
2138 i++;
2139 } else {
2140 if (i <= 0)
2141 return 0;
2142
2143 i--;
2144 }
cec736d2
LP
2145 }
2146
de190aef 2147 /* And jump to it */
fb099c8d
ZJS
2148 r = generic_array_get(f,
2149 le64toh(f->header->entry_array_offset),
2150 i,
2151 ret, &ofs);
2152 if (r <= 0)
2153 return r;
2154
2155 if (p > 0 &&
2156 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2157 log_debug("%s: entry array corrupted at entry %"PRIu64,
2158 f->path, i);
2159 return -EBADMSG;
2160 }
2161
2162 if (offset)
2163 *offset = ofs;
2164
2165 return 1;
de190aef 2166}
cec736d2 2167
de190aef
LP
2168int journal_file_next_entry_for_data(
2169 JournalFile *f,
2170 Object *o, uint64_t p,
2171 uint64_t data_offset,
2172 direction_t direction,
2173 Object **ret, uint64_t *offset) {
2174
2175 uint64_t n, i;
cec736d2 2176 int r;
de190aef 2177 Object *d;
cec736d2
LP
2178
2179 assert(f);
de190aef 2180 assert(p > 0 || !o);
cec736d2 2181
de190aef 2182 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2183 if (r < 0)
de190aef 2184 return r;
cec736d2 2185
de190aef
LP
2186 n = le64toh(d->data.n_entries);
2187 if (n <= 0)
2188 return n;
cec736d2 2189
de190aef
LP
2190 if (!o)
2191 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2192 else {
2193 if (o->object.type != OBJECT_ENTRY)
2194 return -EINVAL;
cec736d2 2195
de190aef
LP
2196 r = generic_array_bisect_plus_one(f,
2197 le64toh(d->data.entry_offset),
2198 le64toh(d->data.entry_array_offset),
2199 le64toh(d->data.n_entries),
2200 p,
2201 test_object_offset,
2202 DIRECTION_DOWN,
2203 NULL, NULL,
2204 &i);
2205
2206 if (r <= 0)
cec736d2
LP
2207 return r;
2208
de190aef
LP
2209 if (direction == DIRECTION_DOWN) {
2210 if (i >= n - 1)
2211 return 0;
cec736d2 2212
de190aef
LP
2213 i++;
2214 } else {
2215 if (i <= 0)
2216 return 0;
cec736d2 2217
de190aef
LP
2218 i--;
2219 }
cec736d2 2220
de190aef 2221 }
cec736d2 2222
de190aef
LP
2223 return generic_array_get_plus_one(f,
2224 le64toh(d->data.entry_offset),
2225 le64toh(d->data.entry_array_offset),
2226 i,
2227 ret, offset);
2228}
cec736d2 2229
cbdca852
LP
2230int journal_file_move_to_entry_by_offset_for_data(
2231 JournalFile *f,
2232 uint64_t data_offset,
2233 uint64_t p,
2234 direction_t direction,
2235 Object **ret, uint64_t *offset) {
2236
2237 int r;
2238 Object *d;
2239
2240 assert(f);
2241
2242 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2243 if (r < 0)
2244 return r;
2245
2246 return generic_array_bisect_plus_one(f,
2247 le64toh(d->data.entry_offset),
2248 le64toh(d->data.entry_array_offset),
2249 le64toh(d->data.n_entries),
2250 p,
2251 test_object_offset,
2252 direction,
2253 ret, offset, NULL);
2254}
2255
2256int journal_file_move_to_entry_by_monotonic_for_data(
2257 JournalFile *f,
2258 uint64_t data_offset,
2259 sd_id128_t boot_id,
2260 uint64_t monotonic,
2261 direction_t direction,
2262 Object **ret, uint64_t *offset) {
2263
cbdca852
LP
2264 Object *o, *d;
2265 int r;
2266 uint64_t b, z;
2267
2268 assert(f);
2269
2270 /* First, seek by time */
47838ab3 2271 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2272 if (r < 0)
2273 return r;
2274 if (r == 0)
2275 return -ENOENT;
2276
2277 r = generic_array_bisect_plus_one(f,
2278 le64toh(o->data.entry_offset),
2279 le64toh(o->data.entry_array_offset),
2280 le64toh(o->data.n_entries),
2281 monotonic,
2282 test_object_monotonic,
2283 direction,
2284 NULL, &z, NULL);
2285 if (r <= 0)
2286 return r;
2287
2288 /* And now, continue seeking until we find an entry that
2289 * exists in both bisection arrays */
2290
2291 for (;;) {
2292 Object *qo;
2293 uint64_t p, q;
2294
2295 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2296 if (r < 0)
2297 return r;
2298
2299 r = generic_array_bisect_plus_one(f,
2300 le64toh(d->data.entry_offset),
2301 le64toh(d->data.entry_array_offset),
2302 le64toh(d->data.n_entries),
2303 z,
2304 test_object_offset,
2305 direction,
2306 NULL, &p, NULL);
2307 if (r <= 0)
2308 return r;
2309
2310 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2311 if (r < 0)
2312 return r;
2313
2314 r = generic_array_bisect_plus_one(f,
2315 le64toh(o->data.entry_offset),
2316 le64toh(o->data.entry_array_offset),
2317 le64toh(o->data.n_entries),
2318 p,
2319 test_object_offset,
2320 direction,
2321 &qo, &q, NULL);
2322
2323 if (r <= 0)
2324 return r;
2325
2326 if (p == q) {
2327 if (ret)
2328 *ret = qo;
2329 if (offset)
2330 *offset = q;
2331
2332 return 1;
2333 }
2334
2335 z = q;
2336 }
cbdca852
LP
2337}
2338
de190aef
LP
2339int journal_file_move_to_entry_by_seqnum_for_data(
2340 JournalFile *f,
2341 uint64_t data_offset,
2342 uint64_t seqnum,
2343 direction_t direction,
2344 Object **ret, uint64_t *offset) {
cec736d2 2345
de190aef
LP
2346 Object *d;
2347 int r;
cec736d2 2348
91a31dde
LP
2349 assert(f);
2350
de190aef 2351 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2352 if (r < 0)
de190aef 2353 return r;
cec736d2 2354
de190aef
LP
2355 return generic_array_bisect_plus_one(f,
2356 le64toh(d->data.entry_offset),
2357 le64toh(d->data.entry_array_offset),
2358 le64toh(d->data.n_entries),
2359 seqnum,
2360 test_object_seqnum,
2361 direction,
2362 ret, offset, NULL);
2363}
cec736d2 2364
de190aef
LP
2365int journal_file_move_to_entry_by_realtime_for_data(
2366 JournalFile *f,
2367 uint64_t data_offset,
2368 uint64_t realtime,
2369 direction_t direction,
2370 Object **ret, uint64_t *offset) {
2371
2372 Object *d;
2373 int r;
2374
91a31dde
LP
2375 assert(f);
2376
de190aef 2377 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2378 if (r < 0)
de190aef
LP
2379 return r;
2380
2381 return generic_array_bisect_plus_one(f,
2382 le64toh(d->data.entry_offset),
2383 le64toh(d->data.entry_array_offset),
2384 le64toh(d->data.n_entries),
2385 realtime,
2386 test_object_realtime,
2387 direction,
2388 ret, offset, NULL);
cec736d2
LP
2389}
2390
0284adc6 2391void journal_file_dump(JournalFile *f) {
7560fffc 2392 Object *o;
7560fffc 2393 int r;
0284adc6 2394 uint64_t p;
7560fffc
LP
2395
2396 assert(f);
2397
0284adc6 2398 journal_file_print_header(f);
7560fffc 2399
0284adc6
LP
2400 p = le64toh(f->header->header_size);
2401 while (p != 0) {
d05089d8 2402 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
2403 if (r < 0)
2404 goto fail;
7560fffc 2405
0284adc6 2406 switch (o->object.type) {
d98cc1f2 2407
0284adc6
LP
2408 case OBJECT_UNUSED:
2409 printf("Type: OBJECT_UNUSED\n");
2410 break;
d98cc1f2 2411
0284adc6
LP
2412 case OBJECT_DATA:
2413 printf("Type: OBJECT_DATA\n");
2414 break;
7560fffc 2415
3c1668da
LP
2416 case OBJECT_FIELD:
2417 printf("Type: OBJECT_FIELD\n");
2418 break;
2419
0284adc6 2420 case OBJECT_ENTRY:
507f22bd
ZJS
2421 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2422 le64toh(o->entry.seqnum),
2423 le64toh(o->entry.monotonic),
2424 le64toh(o->entry.realtime));
0284adc6 2425 break;
7560fffc 2426
0284adc6
LP
2427 case OBJECT_FIELD_HASH_TABLE:
2428 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2429 break;
7560fffc 2430
0284adc6
LP
2431 case OBJECT_DATA_HASH_TABLE:
2432 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2433 break;
7560fffc 2434
0284adc6
LP
2435 case OBJECT_ENTRY_ARRAY:
2436 printf("Type: OBJECT_ENTRY_ARRAY\n");
2437 break;
7560fffc 2438
0284adc6 2439 case OBJECT_TAG:
507f22bd
ZJS
2440 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2441 le64toh(o->tag.seqnum),
2442 le64toh(o->tag.epoch));
0284adc6 2443 break;
3c1668da
LP
2444
2445 default:
8facc349 2446 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 2447 break;
0284adc6 2448 }
7560fffc 2449
d89c8fdf
ZJS
2450 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2451 printf("Flags: %s\n",
2452 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 2453
0284adc6
LP
2454 if (p == le64toh(f->header->tail_object_offset))
2455 p = 0;
2456 else
2457 p = p + ALIGN64(le64toh(o->object.size));
2458 }
7560fffc 2459
0284adc6
LP
2460 return;
2461fail:
2462 log_error("File corrupt");
7560fffc
LP
2463}
2464
718fe4b1
ZJS
2465static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2466 const char *x;
2467
2468 x = format_timestamp(buf, l, t);
2469 if (x)
2470 return x;
2471 return " --- ";
2472}
2473
0284adc6 2474void journal_file_print_header(JournalFile *f) {
2765b7bb 2475 char a[33], b[33], c[33], d[33];
ed375beb 2476 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2477 struct stat st;
2478 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2479
2480 assert(f);
7560fffc 2481
0284adc6
LP
2482 printf("File Path: %s\n"
2483 "File ID: %s\n"
2484 "Machine ID: %s\n"
2485 "Boot ID: %s\n"
2486 "Sequential Number ID: %s\n"
2487 "State: %s\n"
2488 "Compatible Flags:%s%s\n"
d89c8fdf 2489 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
2490 "Header size: %"PRIu64"\n"
2491 "Arena size: %"PRIu64"\n"
2492 "Data Hash Table Size: %"PRIu64"\n"
2493 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2494 "Rotate Suggested: %s\n"
507f22bd
ZJS
2495 "Head Sequential Number: %"PRIu64"\n"
2496 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2497 "Head Realtime Timestamp: %s\n"
3223f44f 2498 "Tail Realtime Timestamp: %s\n"
ed375beb 2499 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2500 "Objects: %"PRIu64"\n"
2501 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2502 f->path,
2503 sd_id128_to_string(f->header->file_id, a),
2504 sd_id128_to_string(f->header->machine_id, b),
2505 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2506 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2507 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2508 f->header->state == STATE_ONLINE ? "ONLINE" :
2509 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 2510 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
2511 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2512 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2513 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2514 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
2515 le64toh(f->header->header_size),
2516 le64toh(f->header->arena_size),
2517 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2518 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2519 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2520 le64toh(f->header->head_entry_seqnum),
2521 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2522 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2523 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2524 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2525 le64toh(f->header->n_objects),
2526 le64toh(f->header->n_entries));
7560fffc 2527
0284adc6 2528 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2529 printf("Data Objects: %"PRIu64"\n"
0284adc6 2530 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2531 le64toh(f->header->n_data),
0284adc6 2532 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2533
0284adc6 2534 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2535 printf("Field Objects: %"PRIu64"\n"
0284adc6 2536 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2537 le64toh(f->header->n_fields),
0284adc6 2538 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2539
2540 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2541 printf("Tag Objects: %"PRIu64"\n",
2542 le64toh(f->header->n_tags));
3223f44f 2543 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2544 printf("Entry Array Objects: %"PRIu64"\n",
2545 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2546
2547 if (fstat(f->fd, &st) >= 0)
59f448cf 2548 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
7560fffc
LP
2549}
2550
fc68c929
LP
2551static int journal_file_warn_btrfs(JournalFile *f) {
2552 unsigned attrs;
2553 int r;
2554
2555 assert(f);
2556
2557 /* Before we write anything, check if the COW logic is turned
2558 * off on btrfs. Given our write pattern that is quite
2559 * unfriendly to COW file systems this should greatly improve
2560 * performance on COW file systems, such as btrfs, at the
2561 * expense of data integrity features (which shouldn't be too
2562 * bad, given that we do our own checksumming). */
2563
2564 r = btrfs_is_filesystem(f->fd);
2565 if (r < 0)
2566 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2567 if (!r)
2568 return 0;
2569
2570 r = read_attr_fd(f->fd, &attrs);
2571 if (r < 0)
2572 return log_warning_errno(r, "Failed to read file attributes: %m");
2573
2574 if (attrs & FS_NOCOW_FL) {
2575 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2576 return 0;
2577 }
2578
2579 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2580 "This is likely to slow down journal access substantially, please consider turning "
2581 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2582
2583 return 1;
2584}
2585
0284adc6
LP
2586int journal_file_open(
2587 const char *fname,
2588 int flags,
2589 mode_t mode,
2590 bool compress,
baed47c3 2591 bool seal,
0284adc6
LP
2592 JournalMetrics *metrics,
2593 MMapCache *mmap_cache,
2594 JournalFile *template,
2595 JournalFile **ret) {
7560fffc 2596
fa6ac760 2597 bool newly_created = false;
0284adc6 2598 JournalFile *f;
fa6ac760 2599 void *h;
0284adc6 2600 int r;
7560fffc 2601
0284adc6 2602 assert(fname);
0559d3a5 2603 assert(ret);
7560fffc 2604
0284adc6
LP
2605 if ((flags & O_ACCMODE) != O_RDONLY &&
2606 (flags & O_ACCMODE) != O_RDWR)
2607 return -EINVAL;
7560fffc 2608
a0108012
LP
2609 if (!endswith(fname, ".journal") &&
2610 !endswith(fname, ".journal~"))
0284adc6 2611 return -EINVAL;
7560fffc 2612
0284adc6
LP
2613 f = new0(JournalFile, 1);
2614 if (!f)
2615 return -ENOMEM;
7560fffc 2616
0284adc6
LP
2617 f->fd = -1;
2618 f->mode = mode;
7560fffc 2619
0284adc6
LP
2620 f->flags = flags;
2621 f->prot = prot_from_flags(flags);
2622 f->writable = (flags & O_ACCMODE) != O_RDONLY;
92261977 2623#if defined(HAVE_LZ4)
d89c8fdf
ZJS
2624 f->compress_lz4 = compress;
2625#elif defined(HAVE_XZ)
2626 f->compress_xz = compress;
48b61739 2627#endif
49a32d43 2628#ifdef HAVE_GCRYPT
baed47c3 2629 f->seal = seal;
49a32d43 2630#endif
7560fffc 2631
0284adc6
LP
2632 if (mmap_cache)
2633 f->mmap = mmap_cache_ref(mmap_cache);
2634 else {
84168d80 2635 f->mmap = mmap_cache_new();
0284adc6
LP
2636 if (!f->mmap) {
2637 r = -ENOMEM;
2638 goto fail;
2639 }
2640 }
7560fffc 2641
0284adc6
LP
2642 f->path = strdup(fname);
2643 if (!f->path) {
2644 r = -ENOMEM;
2645 goto fail;
2646 }
7560fffc 2647
4743015d 2648 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
2649 if (!f->chain_cache) {
2650 r = -ENOMEM;
2651 goto fail;
2652 }
2653
0284adc6
LP
2654 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2655 if (f->fd < 0) {
2656 r = -errno;
2657 goto fail;
7560fffc 2658 }
7560fffc 2659
2678031a
LP
2660 r = journal_file_fstat(f);
2661 if (r < 0)
0284adc6 2662 goto fail;
7560fffc 2663
0284adc6 2664 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a 2665
fc68c929 2666 (void) journal_file_warn_btrfs(f);
11689d2a 2667
fb0951b0
LP
2668 /* Let's attach the creation time to the journal file,
2669 * so that the vacuuming code knows the age of this
2670 * file even if the file might end up corrupted one
2671 * day... Ideally we'd just use the creation time many
2672 * file systems maintain for each file, but there is
2673 * currently no usable API to query this, hence let's
2674 * emulate this via extended attributes. If extended
2675 * attributes are not supported we'll just skip this,
7517e174 2676 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0 2677
d61b600d 2678 fd_setcrtime(f->fd, 0);
7560fffc 2679
feb12d3e 2680#ifdef HAVE_GCRYPT
0284adc6 2681 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2682 * just don't do sealing */
49a32d43
LP
2683 if (f->seal) {
2684 r = journal_file_fss_load(f);
2685 if (r < 0)
2686 f->seal = false;
2687 }
feb12d3e 2688#endif
7560fffc 2689
0284adc6
LP
2690 r = journal_file_init_header(f, template);
2691 if (r < 0)
2692 goto fail;
7560fffc 2693
2678031a
LP
2694 r = journal_file_fstat(f);
2695 if (r < 0)
0284adc6 2696 goto fail;
fb0951b0
LP
2697
2698 newly_created = true;
0284adc6 2699 }
7560fffc 2700
0284adc6
LP
2701 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2702 r = -EIO;
2703 goto fail;
2704 }
7560fffc 2705
fa6ac760 2706 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
977eaa1e 2707 if (r < 0)
0284adc6 2708 goto fail;
7560fffc 2709
fa6ac760
LP
2710 f->header = h;
2711
0284adc6
LP
2712 if (!newly_created) {
2713 r = journal_file_verify_header(f);
2714 if (r < 0)
2715 goto fail;
2716 }
7560fffc 2717
feb12d3e 2718#ifdef HAVE_GCRYPT
0284adc6 2719 if (!newly_created && f->writable) {
baed47c3 2720 r = journal_file_fss_load(f);
0284adc6
LP
2721 if (r < 0)
2722 goto fail;
2723 }
feb12d3e 2724#endif
cec736d2
LP
2725
2726 if (f->writable) {
4a92baf3
LP
2727 if (metrics) {
2728 journal_default_metrics(metrics, f->fd);
2729 f->metrics = *metrics;
2730 } else if (template)
2731 f->metrics = template->metrics;
2732
cec736d2
LP
2733 r = journal_file_refresh_header(f);
2734 if (r < 0)
2735 goto fail;
2736 }
2737
feb12d3e 2738#ifdef HAVE_GCRYPT
baed47c3 2739 r = journal_file_hmac_setup(f);
14d10188
LP
2740 if (r < 0)
2741 goto fail;
feb12d3e 2742#endif
14d10188 2743
cec736d2 2744 if (newly_created) {
de190aef 2745 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2746 if (r < 0)
2747 goto fail;
2748
de190aef 2749 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2750 if (r < 0)
2751 goto fail;
7560fffc 2752
feb12d3e 2753#ifdef HAVE_GCRYPT
7560fffc
LP
2754 r = journal_file_append_first_tag(f);
2755 if (r < 0)
2756 goto fail;
feb12d3e 2757#endif
cec736d2
LP
2758 }
2759
fa6ac760
LP
2760 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2761 r = -EIO;
2762 goto fail;
2763 }
2764
0559d3a5 2765 *ret = f;
cec736d2
LP
2766 return 0;
2767
2768fail:
fa6ac760
LP
2769 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2770 r = -EIO;
2771
cec736d2
LP
2772 journal_file_close(f);
2773
2774 return r;
2775}
0ac38b70 2776
baed47c3 2777int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2778 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2779 size_t l;
2780 JournalFile *old_file, *new_file = NULL;
2781 int r;
2782
2783 assert(f);
2784 assert(*f);
2785
2786 old_file = *f;
2787
2788 if (!old_file->writable)
2789 return -EINVAL;
2790
2791 if (!endswith(old_file->path, ".journal"))
2792 return -EINVAL;
2793
2794 l = strlen(old_file->path);
57535f47
ZJS
2795 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2796 (int) l - 8, old_file->path,
2797 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2798 le64toh((*f)->header->head_entry_seqnum),
2799 le64toh((*f)->header->head_entry_realtime));
2800 if (r < 0)
0ac38b70
LP
2801 return -ENOMEM;
2802
2678031a
LP
2803 /* Try to rename the file to the archived version. If the file
2804 * already was deleted, we'll get ENOENT, let's ignore that
2805 * case. */
0ac38b70 2806 r = rename(old_file->path, p);
2678031a 2807 if (r < 0 && errno != ENOENT)
0ac38b70
LP
2808 return -errno;
2809
ccdbaf91 2810 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2811
f27a3864
LP
2812 /* Currently, btrfs is not very good with out write patterns
2813 * and fragments heavily. Let's defrag our journal files when
2814 * we archive them */
2815 old_file->defrag_on_close = true;
2816
baed47c3 2817 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2818 journal_file_close(old_file);
2819
2820 *f = new_file;
2821 return r;
2822}
2823
9447a7f1
LP
2824int journal_file_open_reliably(
2825 const char *fname,
2826 int flags,
2827 mode_t mode,
7560fffc 2828 bool compress,
baed47c3 2829 bool seal,
4a92baf3 2830 JournalMetrics *metrics,
27370278 2831 MMapCache *mmap_cache,
9447a7f1
LP
2832 JournalFile *template,
2833 JournalFile **ret) {
2834
2835 int r;
2836 size_t l;
ed375beb 2837 _cleanup_free_ char *p = NULL;
9447a7f1 2838
070052ab 2839 r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
288359db
ZJS
2840 if (!IN_SET(r,
2841 -EBADMSG, /* corrupted */
2842 -ENODATA, /* truncated */
2843 -EHOSTDOWN, /* other machine */
2844 -EPROTONOSUPPORT, /* incompatible feature */
2845 -EBUSY, /* unclean shutdown */
2846 -ESHUTDOWN, /* already archived */
2847 -EIO, /* IO error, including SIGBUS on mmap */
2848 -EIDRM /* File has been deleted */))
9447a7f1
LP
2849 return r;
2850
2851 if ((flags & O_ACCMODE) == O_RDONLY)
2852 return r;
2853
2854 if (!(flags & O_CREAT))
2855 return r;
2856
7560fffc
LP
2857 if (!endswith(fname, ".journal"))
2858 return r;
2859
5c70eab4
LP
2860 /* The file is corrupted. Rotate it away and try it again (but only once) */
2861
9447a7f1 2862 l = strlen(fname);
d587eca5 2863 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
57535f47 2864 (int) l - 8, fname,
d587eca5 2865 now(CLOCK_REALTIME),
9bf3b535 2866 random_u64()) < 0)
9447a7f1
LP
2867 return -ENOMEM;
2868
65089b82 2869 if (rename(fname, p) < 0)
9447a7f1
LP
2870 return -errno;
2871
f27a3864
LP
2872 /* btrfs doesn't cope well with our write pattern and
2873 * fragments heavily. Let's defrag all files we rotate */
11689d2a
LP
2874
2875 (void) chattr_path(p, false, FS_NOCOW_FL);
f27a3864
LP
2876 (void) btrfs_defrag(p);
2877
65089b82 2878 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2879
070052ab 2880 return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
9447a7f1
LP
2881}
2882
cf244689
LP
2883int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2884 uint64_t i, n;
2885 uint64_t q, xor_hash = 0;
2886 int r;
2887 EntryItem *items;
2888 dual_timestamp ts;
2889
2890 assert(from);
2891 assert(to);
2892 assert(o);
2893 assert(p);
2894
2895 if (!to->writable)
2896 return -EPERM;
2897
2898 ts.monotonic = le64toh(o->entry.monotonic);
2899 ts.realtime = le64toh(o->entry.realtime);
2900
cf244689 2901 n = journal_file_entry_n_items(o);
4faa7004
TA
2902 /* alloca() can't take 0, hence let's allocate at least one */
2903 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
2904
2905 for (i = 0; i < n; i++) {
4fd052ae
FC
2906 uint64_t l, h;
2907 le64_t le_hash;
cf244689
LP
2908 size_t t;
2909 void *data;
2910 Object *u;
2911
2912 q = le64toh(o->entry.items[i].object_offset);
2913 le_hash = o->entry.items[i].hash;
2914
2915 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2916 if (r < 0)
2917 return r;
2918
2919 if (le_hash != o->data.hash)
2920 return -EBADMSG;
2921
2922 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2923 t = (size_t) l;
2924
2925 /* We hit the limit on 32bit machines */
2926 if ((uint64_t) t != l)
2927 return -E2BIG;
2928
d89c8fdf 2929 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 2930#if defined(HAVE_XZ) || defined(HAVE_LZ4)
a7f7d1bd 2931 size_t rsize = 0;
cf244689 2932
d89c8fdf
ZJS
2933 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2934 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2935 if (r < 0)
2936 return r;
cf244689
LP
2937
2938 data = from->compress_buffer;
2939 l = rsize;
3b1a55e1
ZJS
2940#else
2941 return -EPROTONOSUPPORT;
2942#endif
cf244689
LP
2943 } else
2944 data = o->data.payload;
2945
2946 r = journal_file_append_data(to, data, l, &u, &h);
2947 if (r < 0)
2948 return r;
2949
2950 xor_hash ^= le64toh(u->data.hash);
2951 items[i].object_offset = htole64(h);
2952 items[i].hash = u->data.hash;
2953
2954 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2955 if (r < 0)
2956 return r;
2957 }
2958
fa6ac760
LP
2959 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2960
2961 if (mmap_cache_got_sigbus(to->mmap, to->fd))
2962 return -EIO;
2963
2964 return r;
cf244689 2965}
babfc091 2966
8580d1f7
LP
2967void journal_reset_metrics(JournalMetrics *m) {
2968 assert(m);
2969
2970 /* Set everything to "pick automatic values". */
2971
2972 *m = (JournalMetrics) {
2973 .min_use = (uint64_t) -1,
2974 .max_use = (uint64_t) -1,
2975 .min_size = (uint64_t) -1,
2976 .max_size = (uint64_t) -1,
2977 .keep_free = (uint64_t) -1,
2978 .n_max_files = (uint64_t) -1,
2979 };
2980}
2981
babfc091 2982void journal_default_metrics(JournalMetrics *m, int fd) {
8580d1f7 2983 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
babfc091 2984 struct statvfs ss;
8580d1f7 2985 uint64_t fs_size;
babfc091
LP
2986
2987 assert(m);
2988 assert(fd >= 0);
2989
2990 if (fstatvfs(fd, &ss) >= 0)
2991 fs_size = ss.f_frsize * ss.f_blocks;
8580d1f7
LP
2992 else {
2993 log_debug_errno(errno, "Failed to detremine disk size: %m");
2994 fs_size = 0;
2995 }
babfc091
LP
2996
2997 if (m->max_use == (uint64_t) -1) {
2998
2999 if (fs_size > 0) {
3000 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3001
3002 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3003 m->max_use = DEFAULT_MAX_USE_UPPER;
3004
3005 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3006 m->max_use = DEFAULT_MAX_USE_LOWER;
3007 } else
3008 m->max_use = DEFAULT_MAX_USE_LOWER;
3009 } else {
3010 m->max_use = PAGE_ALIGN(m->max_use);
3011
8580d1f7 3012 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
babfc091
LP
3013 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3014 }
3015
8580d1f7
LP
3016 if (m->min_use == (uint64_t) -1)
3017 m->min_use = DEFAULT_MIN_USE;
3018
3019 if (m->min_use > m->max_use)
3020 m->min_use = m->max_use;
3021
babfc091
LP
3022 if (m->max_size == (uint64_t) -1) {
3023 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3024
3025 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3026 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3027 } else
3028 m->max_size = PAGE_ALIGN(m->max_size);
3029
8580d1f7
LP
3030 if (m->max_size != 0) {
3031 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3032 m->max_size = JOURNAL_FILE_SIZE_MIN;
babfc091 3033
8580d1f7
LP
3034 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3035 m->max_use = m->max_size*2;
3036 }
babfc091
LP
3037
3038 if (m->min_size == (uint64_t) -1)
3039 m->min_size = JOURNAL_FILE_SIZE_MIN;
3040 else {
3041 m->min_size = PAGE_ALIGN(m->min_size);
3042
3043 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3044 m->min_size = JOURNAL_FILE_SIZE_MIN;
3045
8580d1f7 3046 if (m->max_size != 0 && m->min_size > m->max_size)
babfc091
LP
3047 m->max_size = m->min_size;
3048 }
3049
3050 if (m->keep_free == (uint64_t) -1) {
3051
3052 if (fs_size > 0) {
8621b110 3053 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
3054
3055 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3056 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3057
3058 } else
3059 m->keep_free = DEFAULT_KEEP_FREE;
3060 }
3061
8580d1f7
LP
3062 if (m->n_max_files == (uint64_t) -1)
3063 m->n_max_files = DEFAULT_N_MAX_FILES;
3064
3065 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3066 format_bytes(a, sizeof(a), m->min_use),
3067 format_bytes(b, sizeof(b), m->max_use),
3068 format_bytes(c, sizeof(c), m->max_size),
3069 format_bytes(d, sizeof(d), m->min_size),
3070 format_bytes(e, sizeof(e), m->keep_free),
3071 m->n_max_files);
babfc091 3072}
08984293
LP
3073
3074int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
3075 assert(f);
3076 assert(from || to);
3077
3078 if (from) {
162566a4
LP
3079 if (f->header->head_entry_realtime == 0)
3080 return -ENOENT;
08984293 3081
162566a4 3082 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
3083 }
3084
3085 if (to) {
162566a4
LP
3086 if (f->header->tail_entry_realtime == 0)
3087 return -ENOENT;
08984293 3088
162566a4 3089 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
3090 }
3091
3092 return 1;
3093}
3094
3095int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
3096 Object *o;
3097 uint64_t p;
3098 int r;
3099
3100 assert(f);
3101 assert(from || to);
3102
47838ab3 3103 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
3104 if (r <= 0)
3105 return r;
3106
3107 if (le64toh(o->data.n_entries) <= 0)
3108 return 0;
3109
3110 if (from) {
3111 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3112 if (r < 0)
3113 return r;
3114
3115 *from = le64toh(o->entry.monotonic);
3116 }
3117
3118 if (to) {
3119 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3120 if (r < 0)
3121 return r;
3122
3123 r = generic_array_get_plus_one(f,
3124 le64toh(o->data.entry_offset),
3125 le64toh(o->data.entry_array_offset),
3126 le64toh(o->data.n_entries)-1,
3127 &o, NULL);
3128 if (r <= 0)
3129 return r;
3130
3131 *to = le64toh(o->entry.monotonic);
3132 }
3133
3134 return 1;
3135}
dca6219e 3136
fb0951b0 3137bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
3138 assert(f);
3139
3140 /* If we gained new header fields we gained new features,
3141 * hence suggest a rotation */
361f9cbc
LP
3142 if (le64toh(f->header->header_size) < sizeof(Header)) {
3143 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3144 return true;
361f9cbc 3145 }
dca6219e
LP
3146
3147 /* Let's check if the hash tables grew over a certain fill
3148 * level (75%, borrowing this value from Java's hash table
3149 * implementation), and if so suggest a rotation. To calculate
3150 * the fill level we need the n_data field, which only exists
3151 * in newer versions. */
3152
3153 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3154 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3155 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3156 f->path,
3157 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3158 le64toh(f->header->n_data),
3159 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3160 (unsigned long long) f->last_stat.st_size,
3161 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3162 return true;
361f9cbc 3163 }
dca6219e
LP
3164
3165 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3166 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3167 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3168 f->path,
3169 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3170 le64toh(f->header->n_fields),
3171 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3172 return true;
361f9cbc 3173 }
dca6219e 3174
0598fd4a
LP
3175 /* Are the data objects properly indexed by field objects? */
3176 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3177 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3178 le64toh(f->header->n_data) > 0 &&
3179 le64toh(f->header->n_fields) == 0)
3180 return true;
3181
fb0951b0
LP
3182 if (max_file_usec > 0) {
3183 usec_t t, h;
3184
3185 h = le64toh(f->header->head_entry_realtime);
3186 t = now(CLOCK_REALTIME);
3187
3188 if (h > 0 && t > h + max_file_usec)
3189 return true;
3190 }
3191
dca6219e
LP
3192 return false;
3193}