]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
shared: add process-util.[ch]
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <sys/mman.h>
23#include <errno.h>
24#include <sys/uio.h>
25#include <unistd.h>
26#include <sys/statvfs.h>
27#include <fcntl.h>
28#include <stddef.h>
11689d2a 29#include <linux/fs.h>
fb0951b0 30
f27a3864 31#include "btrfs-util.h"
cec736d2
LP
32#include "journal-def.h"
33#include "journal-file.h"
0284adc6 34#include "journal-authenticate.h"
cec736d2 35#include "lookup3.h"
807e17f0 36#include "compress.h"
cec736d2 37
4a92baf3
LP
/* Sizes, in bytes, of the data and field hash tables used when nothing
 * larger is computed for the file. */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Data payloads smaller than this many bytes are never compressed */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL)                   /* 4 MiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)                   /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)           /* 4 GiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL)                /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)         /* 4 GiB */

/* This is the keep_free value when we can't determine the file system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                            /* 1 MiB */

/* n_data was the first entry we added after the initial file format design */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL)                      /* 8 MiB */

/* Reread fstat() of the file for detecting deletions at least this often */
#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)

/* The mmap context to use for the header: we pick the one right above the
 * last defined object type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX
76
9588bc32 77static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
78 assert(f);
79
80 if (!f->writable)
81 return -EPERM;
82
83 if (!(f->fd >= 0 && f->header))
84 return -EINVAL;
85
fa6ac760
LP
86 if (mmap_cache_got_sigbus(f->mmap, f->fd))
87 return -EIO;
88
26687bf8
OS
89 switch(f->header->state) {
90 case STATE_ONLINE:
91 return 0;
92
93 case STATE_OFFLINE:
94 f->header->state = STATE_ONLINE;
95 fsync(f->fd);
96 return 0;
97
98 default:
99 return -EINVAL;
100 }
101}
102
103int journal_file_set_offline(JournalFile *f) {
104 assert(f);
105
106 if (!f->writable)
107 return -EPERM;
108
109 if (!(f->fd >= 0 && f->header))
110 return -EINVAL;
111
112 if (f->header->state != STATE_ONLINE)
113 return 0;
114
115 fsync(f->fd);
116
fa6ac760
LP
117 if (mmap_cache_got_sigbus(f->mmap, f->fd))
118 return -EIO;
119
26687bf8
OS
120 f->header->state = STATE_OFFLINE;
121
fa6ac760
LP
122 if (mmap_cache_got_sigbus(f->mmap, f->fd))
123 return -EIO;
124
26687bf8
OS
125 fsync(f->fd);
126
127 return 0;
128}
129
cec736d2 130void journal_file_close(JournalFile *f) {
de190aef 131 assert(f);
cec736d2 132
feb12d3e 133#ifdef HAVE_GCRYPT
b0af6f41 134 /* Write the final tag */
c586dbf1 135 if (f->seal && f->writable)
b0af6f41 136 journal_file_append_tag(f);
feb12d3e 137#endif
b0af6f41 138
26687bf8 139 journal_file_set_offline(f);
cec736d2 140
fa6ac760
LP
141 if (f->mmap && f->fd >= 0)
142 mmap_cache_close_fd(f->mmap, f->fd);
cec736d2 143
11689d2a
LP
144 if (f->fd >= 0 && f->defrag_on_close) {
145
146 /* Be friendly to btrfs: turn COW back on again now,
147 * and defragment the file. We won't write to the file
148 * ever again, hence remove all fragmentation, and
149 * reenable all the good bits COW usually provides
150 * (such as data checksumming). */
151
1ed8f8c1 152 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
11689d2a
LP
153 (void) btrfs_defrag_fd(f->fd);
154 }
f27a3864 155
03e334a1 156 safe_close(f->fd);
cec736d2 157 free(f->path);
807e17f0 158
16e9f408
LP
159 if (f->mmap)
160 mmap_cache_unref(f->mmap);
161
4743015d 162 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 163
d89c8fdf 164#if defined(HAVE_XZ) || defined(HAVE_LZ4)
807e17f0
LP
165 free(f->compress_buffer);
166#endif
167
7560fffc 168#ifdef HAVE_GCRYPT
baed47c3
LP
169 if (f->fss_file)
170 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
b7c9ae91
LP
171 else if (f->fsprg_state)
172 free(f->fsprg_state);
173
174 free(f->fsprg_seed);
7560fffc
LP
175
176 if (f->hmac)
177 gcry_md_close(f->hmac);
178#endif
179
cec736d2
LP
180 free(f);
181}
182
0ac38b70 183static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 184 Header h = {};
cec736d2
LP
185 ssize_t k;
186 int r;
187
188 assert(f);
189
7560fffc 190 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 191 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 192
d89c8fdf
ZJS
193 h.incompatible_flags |= htole32(
194 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
195 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 196
d89c8fdf
ZJS
197 h.compatible_flags = htole32(
198 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 199
cec736d2
LP
200 r = sd_id128_randomize(&h.file_id);
201 if (r < 0)
202 return r;
203
0ac38b70
LP
204 if (template) {
205 h.seqnum_id = template->header->seqnum_id;
beec0085 206 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
207 } else
208 h.seqnum_id = h.file_id;
cec736d2
LP
209
210 k = pwrite(f->fd, &h, sizeof(h), 0);
211 if (k < 0)
212 return -errno;
213
214 if (k != sizeof(h))
215 return -EIO;
216
217 return 0;
218}
219
220static int journal_file_refresh_header(JournalFile *f) {
de190aef 221 sd_id128_t boot_id;
fa6ac760 222 int r;
cec736d2
LP
223
224 assert(f);
225
226 r = sd_id128_get_machine(&f->header->machine_id);
227 if (r < 0)
228 return r;
229
de190aef 230 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
231 if (r < 0)
232 return r;
233
de190aef
LP
234 if (sd_id128_equal(boot_id, f->header->boot_id))
235 f->tail_entry_monotonic_valid = true;
236
237 f->header->boot_id = boot_id;
238
fa6ac760 239 r = journal_file_set_online(f);
b788cc23 240
7560fffc 241 /* Sync the online state to disk */
a676e665 242 fsync(f->fd);
b788cc23 243
fa6ac760 244 return r;
cec736d2
LP
245}
246
247static int journal_file_verify_header(JournalFile *f) {
d89c8fdf
ZJS
248 uint32_t flags;
249
cec736d2
LP
250 assert(f);
251
7560fffc 252 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
253 return -EBADMSG;
254
7560fffc
LP
255 /* In both read and write mode we refuse to open files with
256 * incompatible flags we don't know */
d89c8fdf
ZJS
257 flags = le32toh(f->header->incompatible_flags);
258 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
259 if (flags & ~HEADER_INCOMPATIBLE_ANY)
260 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
261 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
262 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
263 if (flags)
264 log_debug("Journal file %s uses incompatible flags %"PRIx32
265 " disabled at compilation time.", f->path, flags);
cec736d2 266 return -EPROTONOSUPPORT;
d89c8fdf 267 }
cec736d2 268
7560fffc
LP
269 /* When open for writing we refuse to open files with
270 * compatible flags, too */
d89c8fdf
ZJS
271 flags = le32toh(f->header->compatible_flags);
272 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
273 if (flags & ~HEADER_COMPATIBLE_ANY)
274 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
275 f->path, flags & ~HEADER_COMPATIBLE_ANY);
276 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
277 if (flags)
278 log_debug("Journal file %s uses compatible flags %"PRIx32
279 " disabled at compilation time.", f->path, flags);
280 return -EPROTONOSUPPORT;
7560fffc
LP
281 }
282
db11ac1a
LP
283 if (f->header->state >= _STATE_MAX)
284 return -EBADMSG;
285
dca6219e
LP
286 /* The first addition was n_data, so check that we are at least this large */
287 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
288 return -EBADMSG;
289
8088cbd3 290 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
291 return -EBADMSG;
292
db11ac1a
LP
293 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
294 return -ENODATA;
295
296 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
297 return -ENODATA;
298
7762e02b
LP
299 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
300 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
301 !VALID64(le64toh(f->header->tail_object_offset)) ||
302 !VALID64(le64toh(f->header->entry_array_offset)))
303 return -ENODATA;
304
cec736d2 305 if (f->writable) {
ccdbaf91 306 uint8_t state;
cec736d2
LP
307 sd_id128_t machine_id;
308 int r;
309
310 r = sd_id128_get_machine(&machine_id);
311 if (r < 0)
312 return r;
313
314 if (!sd_id128_equal(machine_id, f->header->machine_id))
315 return -EHOSTDOWN;
316
de190aef 317 state = f->header->state;
cec736d2 318
71fa6f00
LP
319 if (state == STATE_ONLINE) {
320 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
321 return -EBUSY;
322 } else if (state == STATE_ARCHIVED)
cec736d2 323 return -ESHUTDOWN;
71fa6f00 324 else if (state != STATE_OFFLINE) {
8facc349 325 log_debug("Journal file %s has unknown state %i.", f->path, state);
71fa6f00
LP
326 return -EBUSY;
327 }
cec736d2
LP
328 }
329
d89c8fdf
ZJS
330 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
331 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 332
f1889c91 333 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 334
cec736d2
LP
335 return 0;
336}
337
2678031a
LP
338static int journal_file_fstat(JournalFile *f) {
339 assert(f);
340 assert(f->fd >= 0);
341
342 if (fstat(f->fd, &f->last_stat) < 0)
343 return -errno;
344
345 f->last_stat_usec = now(CLOCK_MONOTONIC);
346
347 /* Refuse appending to files that are already deleted */
348 if (f->last_stat.st_nlink <= 0)
349 return -EIDRM;
350
351 return 0;
352}
353
cec736d2 354static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 355 uint64_t old_size, new_size;
fec2aa2f 356 int r;
cec736d2
LP
357
358 assert(f);
359
cec736d2 360 /* We assume that this file is not sparse, and we know that
38ac38b2 361 * for sure, since we always call posix_fallocate()
cec736d2
LP
362 * ourselves */
363
fa6ac760
LP
364 if (mmap_cache_got_sigbus(f->mmap, f->fd))
365 return -EIO;
366
cec736d2 367 old_size =
23b0b2b2 368 le64toh(f->header->header_size) +
cec736d2
LP
369 le64toh(f->header->arena_size);
370
bc85bfee 371 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
372 if (new_size < le64toh(f->header->header_size))
373 new_size = le64toh(f->header->header_size);
bc85bfee 374
2678031a
LP
375 if (new_size <= old_size) {
376
377 /* We already pre-allocated enough space, but before
378 * we write to it, let's check with fstat() if the
379 * file got deleted, in order make sure we don't throw
380 * away the data immediately. Don't check fstat() for
381 * all writes though, but only once ever 10s. */
382
383 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
384 return 0;
385
386 return journal_file_fstat(f);
387 }
388
389 /* Allocate more space. */
cec736d2 390
a676e665 391 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 392 return -E2BIG;
cec736d2 393
a676e665 394 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
395 struct statvfs svfs;
396
397 if (fstatvfs(f->fd, &svfs) >= 0) {
398 uint64_t available;
399
400 available = svfs.f_bfree * svfs.f_bsize;
401
bc85bfee
LP
402 if (available >= f->metrics.keep_free)
403 available -= f->metrics.keep_free;
cec736d2
LP
404 else
405 available = 0;
406
407 if (new_size - old_size > available)
408 return -E2BIG;
409 }
410 }
411
eda4b58b
LP
412 /* Increase by larger blocks at once */
413 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
414 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
415 new_size = f->metrics.max_size;
416
bc85bfee
LP
417 /* Note that the glibc fallocate() fallback is very
418 inefficient, hence we try to minimize the allocation area
419 as we can. */
fec2aa2f
GV
420 r = posix_fallocate(f->fd, old_size, new_size - old_size);
421 if (r != 0)
422 return -r;
cec736d2 423
23b0b2b2 424 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 425
2678031a 426 return journal_file_fstat(f);
cec736d2
LP
427}
428
78519831 429static unsigned type_to_context(ObjectType type) {
d3d3208f 430 /* One context for each type, plus one catch-all for the rest */
69adae51 431 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 432 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 433 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
434}
435
7a9dabea 436static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
2678031a
LP
437 int r;
438
cec736d2 439 assert(f);
cec736d2
LP
440 assert(ret);
441
7762e02b
LP
442 if (size <= 0)
443 return -EINVAL;
444
2a59ea54 445 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
446 if (offset + size > (uint64_t) f->last_stat.st_size) {
447 /* Hmm, out of range? Let's refresh the fstat() data
448 * first, before we trust that check. */
449
2678031a
LP
450 r = journal_file_fstat(f);
451 if (r < 0)
452 return r;
453
454 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
455 return -EADDRNOTAVAIL;
456 }
457
7a9dabea 458 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
459}
460
16e9f408
LP
461static uint64_t minimum_header_size(Object *o) {
462
b8e891e6 463 static const uint64_t table[] = {
16e9f408
LP
464 [OBJECT_DATA] = sizeof(DataObject),
465 [OBJECT_FIELD] = sizeof(FieldObject),
466 [OBJECT_ENTRY] = sizeof(EntryObject),
467 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
468 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
469 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
470 [OBJECT_TAG] = sizeof(TagObject),
471 };
472
473 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
474 return sizeof(ObjectHeader);
475
476 return table[o->object.type];
477}
478
78519831 479int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
480 int r;
481 void *t;
482 Object *o;
483 uint64_t s;
484
485 assert(f);
486 assert(ret);
487
db11ac1a
LP
488 /* Objects may only be located at multiple of 64 bit */
489 if (!VALID64(offset))
490 return -EFAULT;
491
7a9dabea 492 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
493 if (r < 0)
494 return r;
495
496 o = (Object*) t;
497 s = le64toh(o->object.size);
498
499 if (s < sizeof(ObjectHeader))
500 return -EBADMSG;
501
16e9f408
LP
502 if (o->object.type <= OBJECT_UNUSED)
503 return -EBADMSG;
504
505 if (s < minimum_header_size(o))
506 return -EBADMSG;
507
d05089d8 508 if (type > OBJECT_UNUSED && o->object.type != type)
cec736d2
LP
509 return -EBADMSG;
510
511 if (s > sizeof(ObjectHeader)) {
7a9dabea 512 r = journal_file_move_to(f, type, false, offset, s, &t);
cec736d2
LP
513 if (r < 0)
514 return r;
515
516 o = (Object*) t;
517 }
518
cec736d2
LP
519 *ret = o;
520 return 0;
521}
522
d98cc1f2 523static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
524 uint64_t r;
525
526 assert(f);
527
beec0085 528 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
529
530 if (seqnum) {
de190aef 531 /* If an external seqnum counter was passed, we update
c2373f84
LP
532 * both the local and the external one, and set it to
533 * the maximum of both */
534
535 if (*seqnum + 1 > r)
536 r = *seqnum + 1;
537
538 *seqnum = r;
539 }
540
beec0085 541 f->header->tail_entry_seqnum = htole64(r);
cec736d2 542
beec0085
LP
543 if (f->header->head_entry_seqnum == 0)
544 f->header->head_entry_seqnum = htole64(r);
de190aef 545
cec736d2
LP
546 return r;
547}
548
78519831 549int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
550 int r;
551 uint64_t p;
552 Object *tail, *o;
553 void *t;
554
555 assert(f);
d05089d8 556 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
557 assert(size >= sizeof(ObjectHeader));
558 assert(offset);
559 assert(ret);
560
26687bf8
OS
561 r = journal_file_set_online(f);
562 if (r < 0)
563 return r;
564
cec736d2 565 p = le64toh(f->header->tail_object_offset);
cec736d2 566 if (p == 0)
23b0b2b2 567 p = le64toh(f->header->header_size);
cec736d2 568 else {
d05089d8 569 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
570 if (r < 0)
571 return r;
572
573 p += ALIGN64(le64toh(tail->object.size));
574 }
575
576 r = journal_file_allocate(f, p, size);
577 if (r < 0)
578 return r;
579
fcde2389 580 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
581 if (r < 0)
582 return r;
583
584 o = (Object*) t;
585
586 zero(o->object);
de190aef 587 o->object.type = type;
cec736d2
LP
588 o->object.size = htole64(size);
589
590 f->header->tail_object_offset = htole64(p);
cec736d2
LP
591 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
592
593 *ret = o;
594 *offset = p;
595
596 return 0;
597}
598
de190aef 599static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
600 uint64_t s, p;
601 Object *o;
602 int r;
603
604 assert(f);
605
dfabe643 606 /* We estimate that we need 1 hash table entry per 768 of
4a92baf3
LP
607 journal file and we want to make sure we never get beyond
608 75% fill level. Calculate the hash table size for the
609 maximum file size based on these metrics. */
610
dfabe643 611 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
612 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
613 s = DEFAULT_DATA_HASH_TABLE_SIZE;
614
507f22bd 615 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 616
de190aef
LP
617 r = journal_file_append_object(f,
618 OBJECT_DATA_HASH_TABLE,
619 offsetof(Object, hash_table.items) + s,
620 &o, &p);
cec736d2
LP
621 if (r < 0)
622 return r;
623
29804cc1 624 memzero(o->hash_table.items, s);
cec736d2 625
de190aef
LP
626 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
627 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
628
629 return 0;
630}
631
de190aef 632static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
633 uint64_t s, p;
634 Object *o;
635 int r;
636
637 assert(f);
638
3c1668da
LP
639 /* We use a fixed size hash table for the fields as this
640 * number should grow very slowly only */
641
de190aef
LP
642 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
643 r = journal_file_append_object(f,
644 OBJECT_FIELD_HASH_TABLE,
645 offsetof(Object, hash_table.items) + s,
646 &o, &p);
cec736d2
LP
647 if (r < 0)
648 return r;
649
29804cc1 650 memzero(o->hash_table.items, s);
cec736d2 651
de190aef
LP
652 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
653 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
654
655 return 0;
656}
657
de190aef 658static int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
659 uint64_t s, p;
660 void *t;
661 int r;
662
663 assert(f);
664
de190aef
LP
665 p = le64toh(f->header->data_hash_table_offset);
666 s = le64toh(f->header->data_hash_table_size);
cec736d2 667
de190aef 668 r = journal_file_move_to(f,
16e9f408 669 OBJECT_DATA_HASH_TABLE,
fcde2389 670 true,
de190aef
LP
671 p, s,
672 &t);
cec736d2
LP
673 if (r < 0)
674 return r;
675
de190aef 676 f->data_hash_table = t;
cec736d2
LP
677 return 0;
678}
679
de190aef 680static int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
681 uint64_t s, p;
682 void *t;
683 int r;
684
685 assert(f);
686
de190aef
LP
687 p = le64toh(f->header->field_hash_table_offset);
688 s = le64toh(f->header->field_hash_table_size);
cec736d2 689
de190aef 690 r = journal_file_move_to(f,
16e9f408 691 OBJECT_FIELD_HASH_TABLE,
fcde2389 692 true,
de190aef
LP
693 p, s,
694 &t);
cec736d2
LP
695 if (r < 0)
696 return r;
697
de190aef 698 f->field_hash_table = t;
cec736d2
LP
699 return 0;
700}
701
3c1668da
LP
702static int journal_file_link_field(
703 JournalFile *f,
704 Object *o,
705 uint64_t offset,
706 uint64_t hash) {
707
805d1486 708 uint64_t p, h, m;
3c1668da
LP
709 int r;
710
711 assert(f);
712 assert(o);
713 assert(offset > 0);
714
715 if (o->object.type != OBJECT_FIELD)
716 return -EINVAL;
717
805d1486
LP
718 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
719 if (m <= 0)
720 return -EBADMSG;
3c1668da 721
805d1486 722 /* This might alter the window we are looking at */
3c1668da
LP
723 o->field.next_hash_offset = o->field.head_data_offset = 0;
724
805d1486 725 h = hash % m;
3c1668da
LP
726 p = le64toh(f->field_hash_table[h].tail_hash_offset);
727 if (p == 0)
728 f->field_hash_table[h].head_hash_offset = htole64(offset);
729 else {
730 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
731 if (r < 0)
732 return r;
733
734 o->field.next_hash_offset = htole64(offset);
735 }
736
737 f->field_hash_table[h].tail_hash_offset = htole64(offset);
738
739 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
740 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
741
742 return 0;
743}
744
745static int journal_file_link_data(
746 JournalFile *f,
747 Object *o,
748 uint64_t offset,
749 uint64_t hash) {
750
805d1486 751 uint64_t p, h, m;
cec736d2
LP
752 int r;
753
754 assert(f);
755 assert(o);
756 assert(offset > 0);
b588975f
LP
757
758 if (o->object.type != OBJECT_DATA)
759 return -EINVAL;
cec736d2 760
805d1486
LP
761 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
762 if (m <= 0)
763 return -EBADMSG;
48496df6 764
805d1486 765 /* This might alter the window we are looking at */
de190aef
LP
766 o->data.next_hash_offset = o->data.next_field_offset = 0;
767 o->data.entry_offset = o->data.entry_array_offset = 0;
768 o->data.n_entries = 0;
cec736d2 769
805d1486 770 h = hash % m;
8db4213e 771 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 772 if (p == 0)
cec736d2 773 /* Only entry in the hash table is easy */
de190aef 774 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 775 else {
48496df6
LP
776 /* Move back to the previous data object, to patch in
777 * pointer */
cec736d2 778
de190aef 779 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
780 if (r < 0)
781 return r;
782
de190aef 783 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
784 }
785
de190aef 786 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 787
dca6219e
LP
788 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
789 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
790
cec736d2
LP
791 return 0;
792}
793
3c1668da
LP
794int journal_file_find_field_object_with_hash(
795 JournalFile *f,
796 const void *field, uint64_t size, uint64_t hash,
797 Object **ret, uint64_t *offset) {
798
805d1486 799 uint64_t p, osize, h, m;
3c1668da
LP
800 int r;
801
802 assert(f);
803 assert(field && size > 0);
804
805 osize = offsetof(Object, field.payload) + size;
806
805d1486
LP
807 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
808
809 if (m <= 0)
3c1668da
LP
810 return -EBADMSG;
811
805d1486 812 h = hash % m;
3c1668da
LP
813 p = le64toh(f->field_hash_table[h].head_hash_offset);
814
815 while (p > 0) {
816 Object *o;
817
818 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
819 if (r < 0)
820 return r;
821
822 if (le64toh(o->field.hash) == hash &&
823 le64toh(o->object.size) == osize &&
824 memcmp(o->field.payload, field, size) == 0) {
825
826 if (ret)
827 *ret = o;
828 if (offset)
829 *offset = p;
830
831 return 1;
832 }
833
834 p = le64toh(o->field.next_hash_offset);
835 }
836
837 return 0;
838}
839
840int journal_file_find_field_object(
841 JournalFile *f,
842 const void *field, uint64_t size,
843 Object **ret, uint64_t *offset) {
844
845 uint64_t hash;
846
847 assert(f);
848 assert(field && size > 0);
849
850 hash = hash64(field, size);
851
852 return journal_file_find_field_object_with_hash(f,
853 field, size, hash,
854 ret, offset);
855}
856
de190aef
LP
857int journal_file_find_data_object_with_hash(
858 JournalFile *f,
859 const void *data, uint64_t size, uint64_t hash,
860 Object **ret, uint64_t *offset) {
48496df6 861
805d1486 862 uint64_t p, osize, h, m;
cec736d2
LP
863 int r;
864
865 assert(f);
866 assert(data || size == 0);
867
868 osize = offsetof(Object, data.payload) + size;
869
805d1486
LP
870 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
871 if (m <= 0)
bc85bfee
LP
872 return -EBADMSG;
873
805d1486 874 h = hash % m;
de190aef 875 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 876
de190aef
LP
877 while (p > 0) {
878 Object *o;
cec736d2 879
de190aef 880 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
881 if (r < 0)
882 return r;
883
807e17f0 884 if (le64toh(o->data.hash) != hash)
85a131e8 885 goto next;
807e17f0 886
d89c8fdf 887 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 888#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51 889 uint64_t l;
a7f7d1bd 890 size_t rsize = 0;
cec736d2 891
807e17f0
LP
892 l = le64toh(o->object.size);
893 if (l <= offsetof(Object, data.payload))
cec736d2
LP
894 return -EBADMSG;
895
807e17f0
LP
896 l -= offsetof(Object, data.payload);
897
d89c8fdf
ZJS
898 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
899 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
900 if (r < 0)
901 return r;
807e17f0 902
b785c858 903 if (rsize == size &&
807e17f0
LP
904 memcmp(f->compress_buffer, data, size) == 0) {
905
906 if (ret)
907 *ret = o;
908
909 if (offset)
910 *offset = p;
911
912 return 1;
913 }
3b1a55e1
ZJS
914#else
915 return -EPROTONOSUPPORT;
916#endif
807e17f0
LP
917 } else if (le64toh(o->object.size) == osize &&
918 memcmp(o->data.payload, data, size) == 0) {
919
cec736d2
LP
920 if (ret)
921 *ret = o;
922
923 if (offset)
924 *offset = p;
925
de190aef 926 return 1;
cec736d2
LP
927 }
928
85a131e8 929 next:
cec736d2
LP
930 p = le64toh(o->data.next_hash_offset);
931 }
932
de190aef
LP
933 return 0;
934}
935
936int journal_file_find_data_object(
937 JournalFile *f,
938 const void *data, uint64_t size,
939 Object **ret, uint64_t *offset) {
940
941 uint64_t hash;
942
943 assert(f);
944 assert(data || size == 0);
945
946 hash = hash64(data, size);
947
948 return journal_file_find_data_object_with_hash(f,
949 data, size, hash,
950 ret, offset);
951}
952
3c1668da
LP
953static int journal_file_append_field(
954 JournalFile *f,
955 const void *field, uint64_t size,
956 Object **ret, uint64_t *offset) {
957
958 uint64_t hash, p;
959 uint64_t osize;
960 Object *o;
961 int r;
962
963 assert(f);
964 assert(field && size > 0);
965
966 hash = hash64(field, size);
967
968 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
969 if (r < 0)
970 return r;
971 else if (r > 0) {
972
973 if (ret)
974 *ret = o;
975
976 if (offset)
977 *offset = p;
978
979 return 0;
980 }
981
982 osize = offsetof(Object, field.payload) + size;
983 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
984 if (r < 0)
985 return r;
3c1668da
LP
986
987 o->field.hash = htole64(hash);
988 memcpy(o->field.payload, field, size);
989
990 r = journal_file_link_field(f, o, p, hash);
991 if (r < 0)
992 return r;
993
994 /* The linking might have altered the window, so let's
995 * refresh our pointer */
996 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
997 if (r < 0)
998 return r;
999
1000#ifdef HAVE_GCRYPT
1001 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1002 if (r < 0)
1003 return r;
1004#endif
1005
1006 if (ret)
1007 *ret = o;
1008
1009 if (offset)
1010 *offset = p;
1011
1012 return 0;
1013}
1014
48496df6
LP
1015static int journal_file_append_data(
1016 JournalFile *f,
1017 const void *data, uint64_t size,
1018 Object **ret, uint64_t *offset) {
1019
de190aef
LP
1020 uint64_t hash, p;
1021 uint64_t osize;
1022 Object *o;
d89c8fdf 1023 int r, compression = 0;
3c1668da 1024 const void *eq;
de190aef
LP
1025
1026 assert(f);
1027 assert(data || size == 0);
1028
1029 hash = hash64(data, size);
1030
1031 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1032 if (r < 0)
1033 return r;
1034 else if (r > 0) {
1035
1036 if (ret)
1037 *ret = o;
1038
1039 if (offset)
1040 *offset = p;
1041
1042 return 0;
1043 }
1044
1045 osize = offsetof(Object, data.payload) + size;
1046 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1047 if (r < 0)
1048 return r;
1049
cec736d2 1050 o->data.hash = htole64(hash);
807e17f0 1051
d89c8fdf
ZJS
1052#if defined(HAVE_XZ) || defined(HAVE_LZ4)
1053 if (f->compress_xz &&
807e17f0 1054 size >= COMPRESSION_SIZE_THRESHOLD) {
a7f7d1bd 1055 size_t rsize = 0;
807e17f0 1056
d89c8fdf 1057 compression = compress_blob(data, size, o->data.payload, &rsize);
807e17f0 1058
d89c8fdf 1059 if (compression) {
807e17f0 1060 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1061 o->object.flags |= compression;
807e17f0 1062
fa1c4b51 1063 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1064 size, rsize, object_compressed_to_string(compression));
807e17f0
LP
1065 }
1066 }
1067#endif
1068
d89c8fdf 1069 if (!compression && size > 0)
807e17f0 1070 memcpy(o->data.payload, data, size);
cec736d2 1071
de190aef 1072 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1073 if (r < 0)
1074 return r;
1075
48496df6
LP
1076 /* The linking might have altered the window, so let's
1077 * refresh our pointer */
1078 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1079 if (r < 0)
1080 return r;
1081
08c6f819
SL
1082 if (!data)
1083 eq = NULL;
1084 else
1085 eq = memchr(data, '=', size);
3c1668da 1086 if (eq && eq > data) {
748db592 1087 Object *fo = NULL;
3c1668da 1088 uint64_t fp;
3c1668da
LP
1089
1090 /* Create field object ... */
1091 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1092 if (r < 0)
1093 return r;
1094
1095 /* ... and link it in. */
1096 o->data.next_field_offset = fo->field.head_data_offset;
1097 fo->field.head_data_offset = le64toh(p);
1098 }
1099
5996c7c2
LP
1100#ifdef HAVE_GCRYPT
1101 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1102 if (r < 0)
1103 return r;
1104#endif
1105
cec736d2
LP
1106 if (ret)
1107 *ret = o;
1108
1109 if (offset)
de190aef 1110 *offset = p;
cec736d2
LP
1111
1112 return 0;
1113}
1114
1115uint64_t journal_file_entry_n_items(Object *o) {
1116 assert(o);
b588975f
LP
1117
1118 if (o->object.type != OBJECT_ENTRY)
1119 return 0;
cec736d2
LP
1120
1121 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1122}
1123
0284adc6 1124uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1125 assert(o);
b588975f
LP
1126
1127 if (o->object.type != OBJECT_ENTRY_ARRAY)
1128 return 0;
de190aef
LP
1129
1130 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1131}
1132
fb9a24b6
LP
1133uint64_t journal_file_hash_table_n_items(Object *o) {
1134 assert(o);
b588975f
LP
1135
1136 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1137 o->object.type != OBJECT_FIELD_HASH_TABLE)
1138 return 0;
fb9a24b6
LP
1139
1140 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1141}
1142
de190aef 1143static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1144 le64_t *first,
1145 le64_t *idx,
de190aef 1146 uint64_t p) {
cec736d2 1147 int r;
de190aef
LP
1148 uint64_t n = 0, ap = 0, q, i, a, hidx;
1149 Object *o;
1150
cec736d2 1151 assert(f);
de190aef
LP
1152 assert(first);
1153 assert(idx);
1154 assert(p > 0);
cec736d2 1155
de190aef
LP
1156 a = le64toh(*first);
1157 i = hidx = le64toh(*idx);
1158 while (a > 0) {
1159
1160 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1161 if (r < 0)
1162 return r;
cec736d2 1163
de190aef
LP
1164 n = journal_file_entry_array_n_items(o);
1165 if (i < n) {
1166 o->entry_array.items[i] = htole64(p);
1167 *idx = htole64(hidx + 1);
1168 return 0;
1169 }
cec736d2 1170
de190aef
LP
1171 i -= n;
1172 ap = a;
1173 a = le64toh(o->entry_array.next_entry_array_offset);
1174 }
1175
1176 if (hidx > n)
1177 n = (hidx+1) * 2;
1178 else
1179 n = n * 2;
1180
1181 if (n < 4)
1182 n = 4;
1183
1184 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1185 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1186 &o, &q);
cec736d2
LP
1187 if (r < 0)
1188 return r;
1189
feb12d3e 1190#ifdef HAVE_GCRYPT
5996c7c2 1191 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1192 if (r < 0)
1193 return r;
feb12d3e 1194#endif
b0af6f41 1195
de190aef 1196 o->entry_array.items[i] = htole64(p);
cec736d2 1197
de190aef 1198 if (ap == 0)
7be3aa17 1199 *first = htole64(q);
cec736d2 1200 else {
de190aef 1201 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1202 if (r < 0)
1203 return r;
1204
de190aef
LP
1205 o->entry_array.next_entry_array_offset = htole64(q);
1206 }
cec736d2 1207
2dee23eb
LP
1208 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1209 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1210
de190aef
LP
1211 *idx = htole64(hidx + 1);
1212
1213 return 0;
1214}
cec736d2 1215
de190aef 1216static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1217 le64_t *extra,
1218 le64_t *first,
1219 le64_t *idx,
de190aef
LP
1220 uint64_t p) {
1221
1222 int r;
1223
1224 assert(f);
1225 assert(extra);
1226 assert(first);
1227 assert(idx);
1228 assert(p > 0);
1229
1230 if (*idx == 0)
1231 *extra = htole64(p);
1232 else {
4fd052ae 1233 le64_t i;
de190aef 1234
7be3aa17 1235 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1236 r = link_entry_into_array(f, first, &i, p);
1237 if (r < 0)
1238 return r;
cec736d2
LP
1239 }
1240
de190aef
LP
1241 *idx = htole64(le64toh(*idx) + 1);
1242 return 0;
1243}
1244
1245static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1246 uint64_t p;
1247 int r;
1248 assert(f);
1249 assert(o);
1250 assert(offset > 0);
1251
1252 p = le64toh(o->entry.items[i].object_offset);
1253 if (p == 0)
1254 return -EINVAL;
1255
1256 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1257 if (r < 0)
1258 return r;
1259
de190aef
LP
1260 return link_entry_into_array_plus_one(f,
1261 &o->data.entry_offset,
1262 &o->data.entry_array_offset,
1263 &o->data.n_entries,
1264 offset);
cec736d2
LP
1265}
1266
1267static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1268 uint64_t n, i;
cec736d2
LP
1269 int r;
1270
1271 assert(f);
1272 assert(o);
1273 assert(offset > 0);
b588975f
LP
1274
1275 if (o->object.type != OBJECT_ENTRY)
1276 return -EINVAL;
cec736d2 1277
b788cc23
LP
1278 __sync_synchronize();
1279
cec736d2 1280 /* Link up the entry itself */
de190aef
LP
1281 r = link_entry_into_array(f,
1282 &f->header->entry_array_offset,
1283 &f->header->n_entries,
1284 offset);
1285 if (r < 0)
1286 return r;
cec736d2 1287
507f22bd 1288 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1289
de190aef 1290 if (f->header->head_entry_realtime == 0)
0ac38b70 1291 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1292
0ac38b70 1293 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1294 f->header->tail_entry_monotonic = o->entry.monotonic;
1295
1296 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1297
1298 /* Link up the items */
1299 n = journal_file_entry_n_items(o);
1300 for (i = 0; i < n; i++) {
1301 r = journal_file_link_entry_item(f, o, offset, i);
1302 if (r < 0)
1303 return r;
1304 }
1305
cec736d2
LP
1306 return 0;
1307}
1308
1309static int journal_file_append_entry_internal(
1310 JournalFile *f,
1311 const dual_timestamp *ts,
1312 uint64_t xor_hash,
1313 const EntryItem items[], unsigned n_items,
de190aef 1314 uint64_t *seqnum,
cec736d2
LP
1315 Object **ret, uint64_t *offset) {
1316 uint64_t np;
1317 uint64_t osize;
1318 Object *o;
1319 int r;
1320
1321 assert(f);
1322 assert(items || n_items == 0);
de190aef 1323 assert(ts);
cec736d2
LP
1324
1325 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1326
de190aef 1327 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1328 if (r < 0)
1329 return r;
1330
d98cc1f2 1331 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1332 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1333 o->entry.realtime = htole64(ts->realtime);
1334 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1335 o->entry.xor_hash = htole64(xor_hash);
1336 o->entry.boot_id = f->header->boot_id;
1337
feb12d3e 1338#ifdef HAVE_GCRYPT
5996c7c2 1339 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1340 if (r < 0)
1341 return r;
feb12d3e 1342#endif
b0af6f41 1343
cec736d2
LP
1344 r = journal_file_link_entry(f, o, np);
1345 if (r < 0)
1346 return r;
1347
1348 if (ret)
1349 *ret = o;
1350
1351 if (offset)
1352 *offset = np;
1353
1354 return 0;
1355}
1356
cf244689 1357void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1358 assert(f);
1359
1360 /* inotify() does not receive IN_MODIFY events from file
1361 * accesses done via mmap(). After each access we hence
1362 * trigger IN_MODIFY by truncating the journal file to its
1363 * current size which triggers IN_MODIFY. */
1364
bc85bfee
LP
1365 __sync_synchronize();
1366
50f20cfd 1367 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
56f64d95 1368 log_error_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1369}
1370
1f2da9ec
LP
1371static int entry_item_cmp(const void *_a, const void *_b) {
1372 const EntryItem *a = _a, *b = _b;
1373
1374 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1375 return -1;
1376 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1377 return 1;
1378 return 0;
1379}
1380
de190aef 1381int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1382 unsigned i;
1383 EntryItem *items;
1384 int r;
1385 uint64_t xor_hash = 0;
de190aef 1386 struct dual_timestamp _ts;
cec736d2
LP
1387
1388 assert(f);
1389 assert(iovec || n_iovec == 0);
1390
de190aef
LP
1391 if (!ts) {
1392 dual_timestamp_get(&_ts);
1393 ts = &_ts;
1394 }
1395
1396 if (f->tail_entry_monotonic_valid &&
1397 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1398 return -EINVAL;
1399
feb12d3e 1400#ifdef HAVE_GCRYPT
7560fffc
LP
1401 r = journal_file_maybe_append_tag(f, ts->realtime);
1402 if (r < 0)
1403 return r;
feb12d3e 1404#endif
7560fffc 1405
64825d3c 1406 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1407 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1408
1409 for (i = 0; i < n_iovec; i++) {
1410 uint64_t p;
1411 Object *o;
1412
1413 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1414 if (r < 0)
cf244689 1415 return r;
cec736d2
LP
1416
1417 xor_hash ^= le64toh(o->data.hash);
1418 items[i].object_offset = htole64(p);
de7b95cd 1419 items[i].hash = o->data.hash;
cec736d2
LP
1420 }
1421
1f2da9ec
LP
1422 /* Order by the position on disk, in order to improve seek
1423 * times for rotating media. */
7ff7394d 1424 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1425
de190aef 1426 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1427
fa6ac760
LP
1428 /* If the memory mapping triggered a SIGBUS then we return an
1429 * IO error and ignore the error code passed down to us, since
1430 * it is very likely just an effect of a nullified replacement
1431 * mapping page */
1432
1433 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1434 r = -EIO;
1435
50f20cfd
LP
1436 journal_file_post_change(f);
1437
cec736d2
LP
1438 return r;
1439}
1440
/* A remembered position within a chain of entry arrays. Items are stored in
 * a hashmap keyed by the 'first' field. */
typedef struct ChainCacheItem {
        /* the array at the beginning of the chain */
        uint64_t first;

        /* the cached array */
        uint64_t array;

        /* the first item in the cached array */
        uint64_t begin;

        /* the total number of items in all arrays before this one in the chain */
        uint64_t total;

        /* the last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1448
1449static void chain_cache_put(
4743015d 1450 OrderedHashmap *h,
a4bcff5b
LP
1451 ChainCacheItem *ci,
1452 uint64_t first,
1453 uint64_t array,
1454 uint64_t begin,
f268980d
LP
1455 uint64_t total,
1456 uint64_t last_index) {
a4bcff5b
LP
1457
1458 if (!ci) {
34741aa3
LP
1459 /* If the chain item to cache for this chain is the
1460 * first one it's not worth caching anything */
1461 if (array == first)
1462 return;
1463
29433089 1464 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 1465 ci = ordered_hashmap_steal_first(h);
29433089
LP
1466 assert(ci);
1467 } else {
a4bcff5b
LP
1468 ci = new(ChainCacheItem, 1);
1469 if (!ci)
1470 return;
1471 }
1472
1473 ci->first = first;
1474
4743015d 1475 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
1476 free(ci);
1477 return;
1478 }
1479 } else
1480 assert(ci->first == first);
1481
1482 ci->array = array;
1483 ci->begin = begin;
1484 ci->total = total;
f268980d 1485 ci->last_index = last_index;
a4bcff5b
LP
1486}
1487
f268980d
LP
1488static int generic_array_get(
1489 JournalFile *f,
1490 uint64_t first,
1491 uint64_t i,
1492 Object **ret, uint64_t *offset) {
de190aef 1493
cec736d2 1494 Object *o;
a4bcff5b 1495 uint64_t p = 0, a, t = 0;
cec736d2 1496 int r;
a4bcff5b 1497 ChainCacheItem *ci;
cec736d2
LP
1498
1499 assert(f);
1500
de190aef 1501 a = first;
a4bcff5b
LP
1502
1503 /* Try the chain cache first */
4743015d 1504 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1505 if (ci && i > ci->total) {
1506 a = ci->array;
1507 i -= ci->total;
1508 t = ci->total;
1509 }
1510
de190aef 1511 while (a > 0) {
a4bcff5b 1512 uint64_t k;
cec736d2 1513
de190aef
LP
1514 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1515 if (r < 0)
1516 return r;
cec736d2 1517
a4bcff5b
LP
1518 k = journal_file_entry_array_n_items(o);
1519 if (i < k) {
de190aef 1520 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1521 goto found;
cec736d2
LP
1522 }
1523
a4bcff5b
LP
1524 i -= k;
1525 t += k;
de190aef
LP
1526 a = le64toh(o->entry_array.next_entry_array_offset);
1527 }
1528
a4bcff5b
LP
1529 return 0;
1530
1531found:
1532 /* Let's cache this item for the next invocation */
af13a6b0 1533 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1534
1535 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1536 if (r < 0)
1537 return r;
1538
1539 if (ret)
1540 *ret = o;
1541
1542 if (offset)
1543 *offset = p;
1544
1545 return 1;
1546}
1547
f268980d
LP
1548static int generic_array_get_plus_one(
1549 JournalFile *f,
1550 uint64_t extra,
1551 uint64_t first,
1552 uint64_t i,
1553 Object **ret, uint64_t *offset) {
de190aef
LP
1554
1555 Object *o;
1556
1557 assert(f);
1558
1559 if (i == 0) {
1560 int r;
1561
1562 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1563 if (r < 0)
1564 return r;
1565
de190aef
LP
1566 if (ret)
1567 *ret = o;
cec736d2 1568
de190aef
LP
1569 if (offset)
1570 *offset = extra;
cec736d2 1571
de190aef 1572 return 1;
cec736d2
LP
1573 }
1574
de190aef
LP
1575 return generic_array_get(f, first, i-1, ret, offset);
1576}
cec736d2 1577
de190aef
LP
/* Non-negative results of the test_object callbacks used for bisection */
enum {
        TEST_FOUND,     /* the tested object matches the needle */
        TEST_LEFT,      /* the tested object compares lower than the needle */
        TEST_RIGHT,     /* the tested object compares higher than the needle */
};
cec736d2 1583
f268980d
LP
1584static int generic_array_bisect(
1585 JournalFile *f,
1586 uint64_t first,
1587 uint64_t n,
1588 uint64_t needle,
1589 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1590 direction_t direction,
1591 Object **ret,
1592 uint64_t *offset,
1593 uint64_t *idx) {
1594
1595 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1596 bool subtract_one = false;
1597 Object *o, *array = NULL;
1598 int r;
a4bcff5b 1599 ChainCacheItem *ci;
cec736d2 1600
de190aef
LP
1601 assert(f);
1602 assert(test_object);
cec736d2 1603
a4bcff5b 1604 /* Start with the first array in the chain */
de190aef 1605 a = first;
a4bcff5b 1606
4743015d 1607 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1608 if (ci && n > ci->total) {
1609 /* Ah, we have iterated this bisection array chain
1610 * previously! Let's see if we can skip ahead in the
1611 * chain, as far as the last time. But we can't jump
1612 * backwards in the chain, so let's check that
1613 * first. */
1614
1615 r = test_object(f, ci->begin, needle);
1616 if (r < 0)
1617 return r;
1618
1619 if (r == TEST_LEFT) {
f268980d 1620 /* OK, what we are looking for is right of the
a4bcff5b
LP
1621 * begin of this EntryArray, so let's jump
1622 * straight to previously cached array in the
1623 * chain */
1624
1625 a = ci->array;
1626 n -= ci->total;
1627 t = ci->total;
f268980d 1628 last_index = ci->last_index;
a4bcff5b
LP
1629 }
1630 }
1631
de190aef
LP
1632 while (a > 0) {
1633 uint64_t left, right, k, lp;
1634
1635 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1636 if (r < 0)
1637 return r;
1638
de190aef
LP
1639 k = journal_file_entry_array_n_items(array);
1640 right = MIN(k, n);
1641 if (right <= 0)
1642 return 0;
cec736d2 1643
de190aef
LP
1644 i = right - 1;
1645 lp = p = le64toh(array->entry_array.items[i]);
1646 if (p <= 0)
1647 return -EBADMSG;
cec736d2 1648
de190aef
LP
1649 r = test_object(f, p, needle);
1650 if (r < 0)
1651 return r;
cec736d2 1652
de190aef
LP
1653 if (r == TEST_FOUND)
1654 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1655
1656 if (r == TEST_RIGHT) {
1657 left = 0;
1658 right -= 1;
f268980d
LP
1659
1660 if (last_index != (uint64_t) -1) {
1661 assert(last_index <= right);
1662
1663 /* If we cached the last index we
1664 * looked at, let's try to not to jump
1665 * too wildly around and see if we can
1666 * limit the range to look at early to
1667 * the immediate neighbors of the last
1668 * index we looked at. */
1669
1670 if (last_index > 0) {
1671 uint64_t x = last_index - 1;
1672
1673 p = le64toh(array->entry_array.items[x]);
1674 if (p <= 0)
1675 return -EBADMSG;
1676
1677 r = test_object(f, p, needle);
1678 if (r < 0)
1679 return r;
1680
1681 if (r == TEST_FOUND)
1682 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1683
1684 if (r == TEST_RIGHT)
1685 right = x;
1686 else
1687 left = x + 1;
1688 }
1689
1690 if (last_index < right) {
1691 uint64_t y = last_index + 1;
1692
1693 p = le64toh(array->entry_array.items[y]);
1694 if (p <= 0)
1695 return -EBADMSG;
1696
1697 r = test_object(f, p, needle);
1698 if (r < 0)
1699 return r;
1700
1701 if (r == TEST_FOUND)
1702 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1703
1704 if (r == TEST_RIGHT)
1705 right = y;
1706 else
1707 left = y + 1;
1708 }
f268980d
LP
1709 }
1710
de190aef
LP
1711 for (;;) {
1712 if (left == right) {
1713 if (direction == DIRECTION_UP)
1714 subtract_one = true;
1715
1716 i = left;
1717 goto found;
1718 }
1719
1720 assert(left < right);
de190aef 1721 i = (left + right) / 2;
f268980d 1722
de190aef
LP
1723 p = le64toh(array->entry_array.items[i]);
1724 if (p <= 0)
1725 return -EBADMSG;
1726
1727 r = test_object(f, p, needle);
1728 if (r < 0)
1729 return r;
cec736d2 1730
de190aef
LP
1731 if (r == TEST_FOUND)
1732 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1733
1734 if (r == TEST_RIGHT)
1735 right = i;
1736 else
1737 left = i + 1;
1738 }
1739 }
1740
2173cbf8 1741 if (k >= n) {
cbdca852
LP
1742 if (direction == DIRECTION_UP) {
1743 i = n;
1744 subtract_one = true;
1745 goto found;
1746 }
1747
cec736d2 1748 return 0;
cbdca852 1749 }
cec736d2 1750
de190aef
LP
1751 last_p = lp;
1752
1753 n -= k;
1754 t += k;
f268980d 1755 last_index = (uint64_t) -1;
de190aef 1756 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1757 }
1758
1759 return 0;
de190aef
LP
1760
1761found:
1762 if (subtract_one && t == 0 && i == 0)
1763 return 0;
1764
a4bcff5b 1765 /* Let's cache this item for the next invocation */
af13a6b0 1766 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 1767
de190aef
LP
1768 if (subtract_one && i == 0)
1769 p = last_p;
1770 else if (subtract_one)
1771 p = le64toh(array->entry_array.items[i-1]);
1772 else
1773 p = le64toh(array->entry_array.items[i]);
1774
1775 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1776 if (r < 0)
1777 return r;
1778
1779 if (ret)
1780 *ret = o;
1781
1782 if (offset)
1783 *offset = p;
1784
1785 if (idx)
cbdca852 1786 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1787
1788 return 1;
cec736d2
LP
1789}
1790
f268980d
LP
1791static int generic_array_bisect_plus_one(
1792 JournalFile *f,
1793 uint64_t extra,
1794 uint64_t first,
1795 uint64_t n,
1796 uint64_t needle,
1797 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1798 direction_t direction,
1799 Object **ret,
1800 uint64_t *offset,
1801 uint64_t *idx) {
de190aef 1802
cec736d2 1803 int r;
cbdca852
LP
1804 bool step_back = false;
1805 Object *o;
cec736d2
LP
1806
1807 assert(f);
de190aef 1808 assert(test_object);
cec736d2 1809
de190aef
LP
1810 if (n <= 0)
1811 return 0;
cec736d2 1812
de190aef
LP
1813 /* This bisects the array in object 'first', but first checks
1814 * an extra */
de190aef
LP
1815 r = test_object(f, extra, needle);
1816 if (r < 0)
1817 return r;
a536e261
LP
1818
1819 if (r == TEST_FOUND)
1820 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1821
cbdca852
LP
1822 /* if we are looking with DIRECTION_UP then we need to first
1823 see if in the actual array there is a matching entry, and
1824 return the last one of that. But if there isn't any we need
1825 to return this one. Hence remember this, and return it
1826 below. */
1827 if (r == TEST_LEFT)
1828 step_back = direction == DIRECTION_UP;
de190aef 1829
cbdca852
LP
1830 if (r == TEST_RIGHT) {
1831 if (direction == DIRECTION_DOWN)
1832 goto found;
1833 else
1834 return 0;
a536e261 1835 }
cec736d2 1836
de190aef
LP
1837 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1838
cbdca852
LP
1839 if (r == 0 && step_back)
1840 goto found;
1841
ecf68b1d 1842 if (r > 0 && idx)
de190aef
LP
1843 (*idx) ++;
1844
1845 return r;
cbdca852
LP
1846
1847found:
1848 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1849 if (r < 0)
1850 return r;
1851
1852 if (ret)
1853 *ret = o;
1854
1855 if (offset)
1856 *offset = extra;
1857
1858 if (idx)
1859 *idx = 0;
1860
1861 return 1;
1862}
1863
44a6b1b6 1864_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1865 assert(f);
1866 assert(p > 0);
1867
1868 if (p == needle)
1869 return TEST_FOUND;
1870 else if (p < needle)
1871 return TEST_LEFT;
1872 else
1873 return TEST_RIGHT;
1874}
1875
de190aef
LP
1876static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1877 Object *o;
1878 int r;
1879
1880 assert(f);
1881 assert(p > 0);
1882
1883 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1884 if (r < 0)
1885 return r;
1886
de190aef
LP
1887 if (le64toh(o->entry.seqnum) == needle)
1888 return TEST_FOUND;
1889 else if (le64toh(o->entry.seqnum) < needle)
1890 return TEST_LEFT;
1891 else
1892 return TEST_RIGHT;
1893}
cec736d2 1894
de190aef
LP
1895int journal_file_move_to_entry_by_seqnum(
1896 JournalFile *f,
1897 uint64_t seqnum,
1898 direction_t direction,
1899 Object **ret,
1900 uint64_t *offset) {
1901
1902 return generic_array_bisect(f,
1903 le64toh(f->header->entry_array_offset),
1904 le64toh(f->header->n_entries),
1905 seqnum,
1906 test_object_seqnum,
1907 direction,
1908 ret, offset, NULL);
1909}
cec736d2 1910
de190aef
LP
1911static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1912 Object *o;
1913 int r;
1914
1915 assert(f);
1916 assert(p > 0);
1917
1918 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1919 if (r < 0)
1920 return r;
1921
1922 if (le64toh(o->entry.realtime) == needle)
1923 return TEST_FOUND;
1924 else if (le64toh(o->entry.realtime) < needle)
1925 return TEST_LEFT;
1926 else
1927 return TEST_RIGHT;
cec736d2
LP
1928}
1929
de190aef
LP
1930int journal_file_move_to_entry_by_realtime(
1931 JournalFile *f,
1932 uint64_t realtime,
1933 direction_t direction,
1934 Object **ret,
1935 uint64_t *offset) {
1936
1937 return generic_array_bisect(f,
1938 le64toh(f->header->entry_array_offset),
1939 le64toh(f->header->n_entries),
1940 realtime,
1941 test_object_realtime,
1942 direction,
1943 ret, offset, NULL);
1944}
1945
1946static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1947 Object *o;
1948 int r;
1949
1950 assert(f);
1951 assert(p > 0);
1952
1953 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1954 if (r < 0)
1955 return r;
1956
1957 if (le64toh(o->entry.monotonic) == needle)
1958 return TEST_FOUND;
1959 else if (le64toh(o->entry.monotonic) < needle)
1960 return TEST_LEFT;
1961 else
1962 return TEST_RIGHT;
1963}
1964
2a560338 1965static int find_data_object_by_boot_id(
47838ab3
ZJS
1966 JournalFile *f,
1967 sd_id128_t boot_id,
1968 Object **o,
1969 uint64_t *b) {
2a560338 1970
47838ab3
ZJS
1971 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1972
1973 sd_id128_to_string(boot_id, t + 9);
1974 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1975}
1976
de190aef
LP
1977int journal_file_move_to_entry_by_monotonic(
1978 JournalFile *f,
1979 sd_id128_t boot_id,
1980 uint64_t monotonic,
1981 direction_t direction,
1982 Object **ret,
1983 uint64_t *offset) {
1984
de190aef
LP
1985 Object *o;
1986 int r;
1987
cbdca852 1988 assert(f);
de190aef 1989
47838ab3 1990 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
1991 if (r < 0)
1992 return r;
cbdca852 1993 if (r == 0)
de190aef
LP
1994 return -ENOENT;
1995
1996 return generic_array_bisect_plus_one(f,
1997 le64toh(o->data.entry_offset),
1998 le64toh(o->data.entry_array_offset),
1999 le64toh(o->data.n_entries),
2000 monotonic,
2001 test_object_monotonic,
2002 direction,
2003 ret, offset, NULL);
2004}
2005
1fc605b0 2006void journal_file_reset_location(JournalFile *f) {
6573ef05 2007 f->location_type = LOCATION_HEAD;
1fc605b0 2008 f->current_offset = 0;
6573ef05
MS
2009 f->current_seqnum = 0;
2010 f->current_realtime = 0;
2011 f->current_monotonic = 0;
2012 zero(f->current_boot_id);
2013 f->current_xor_hash = 0;
2014}
2015
950c07d4 2016void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2017 f->location_type = LOCATION_SEEK;
2018 f->current_offset = offset;
2019 f->current_seqnum = le64toh(o->entry.seqnum);
2020 f->current_realtime = le64toh(o->entry.realtime);
2021 f->current_monotonic = le64toh(o->entry.monotonic);
2022 f->current_boot_id = o->entry.boot_id;
2023 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2024}
2025
d8ae66d7
MS
2026int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2027 assert(af);
2028 assert(bf);
2029 assert(af->location_type == LOCATION_SEEK);
2030 assert(bf->location_type == LOCATION_SEEK);
2031
2032 /* If contents and timestamps match, these entries are
2033 * identical, even if the seqnum does not match */
2034 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2035 af->current_monotonic == bf->current_monotonic &&
2036 af->current_realtime == bf->current_realtime &&
2037 af->current_xor_hash == bf->current_xor_hash)
2038 return 0;
2039
2040 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2041
2042 /* If this is from the same seqnum source, compare
2043 * seqnums */
2044 if (af->current_seqnum < bf->current_seqnum)
2045 return -1;
2046 if (af->current_seqnum > bf->current_seqnum)
2047 return 1;
2048
2049 /* Wow! This is weird, different data but the same
2050 * seqnums? Something is borked, but let's make the
2051 * best of it and compare by time. */
2052 }
2053
2054 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2055
2056 /* If the boot id matches, compare monotonic time */
2057 if (af->current_monotonic < bf->current_monotonic)
2058 return -1;
2059 if (af->current_monotonic > bf->current_monotonic)
2060 return 1;
2061 }
2062
2063 /* Otherwise, compare UTC time */
2064 if (af->current_realtime < bf->current_realtime)
2065 return -1;
2066 if (af->current_realtime > bf->current_realtime)
2067 return 1;
2068
2069 /* Finally, compare by contents */
2070 if (af->current_xor_hash < bf->current_xor_hash)
2071 return -1;
2072 if (af->current_xor_hash > bf->current_xor_hash)
2073 return 1;
2074
2075 return 0;
2076}
2077
de190aef
LP
2078int journal_file_next_entry(
2079 JournalFile *f,
f534928a 2080 uint64_t p,
de190aef
LP
2081 direction_t direction,
2082 Object **ret, uint64_t *offset) {
2083
fb099c8d 2084 uint64_t i, n, ofs;
cec736d2
LP
2085 int r;
2086
2087 assert(f);
de190aef
LP
2088
2089 n = le64toh(f->header->n_entries);
2090 if (n <= 0)
2091 return 0;
cec736d2 2092
f534928a 2093 if (p == 0)
de190aef 2094 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2095 else {
de190aef
LP
2096 r = generic_array_bisect(f,
2097 le64toh(f->header->entry_array_offset),
2098 le64toh(f->header->n_entries),
2099 p,
2100 test_object_offset,
2101 DIRECTION_DOWN,
2102 NULL, NULL,
2103 &i);
2104 if (r <= 0)
2105 return r;
2106
2107 if (direction == DIRECTION_DOWN) {
2108 if (i >= n - 1)
2109 return 0;
2110
2111 i++;
2112 } else {
2113 if (i <= 0)
2114 return 0;
2115
2116 i--;
2117 }
cec736d2
LP
2118 }
2119
de190aef 2120 /* And jump to it */
fb099c8d
ZJS
2121 r = generic_array_get(f,
2122 le64toh(f->header->entry_array_offset),
2123 i,
2124 ret, &ofs);
2125 if (r <= 0)
2126 return r;
2127
2128 if (p > 0 &&
2129 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2130 log_debug("%s: entry array corrupted at entry %"PRIu64,
2131 f->path, i);
2132 return -EBADMSG;
2133 }
2134
2135 if (offset)
2136 *offset = ofs;
2137
2138 return 1;
de190aef 2139}
cec736d2 2140
de190aef
LP
2141int journal_file_next_entry_for_data(
2142 JournalFile *f,
2143 Object *o, uint64_t p,
2144 uint64_t data_offset,
2145 direction_t direction,
2146 Object **ret, uint64_t *offset) {
2147
2148 uint64_t n, i;
cec736d2 2149 int r;
de190aef 2150 Object *d;
cec736d2
LP
2151
2152 assert(f);
de190aef 2153 assert(p > 0 || !o);
cec736d2 2154
de190aef 2155 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2156 if (r < 0)
de190aef 2157 return r;
cec736d2 2158
de190aef
LP
2159 n = le64toh(d->data.n_entries);
2160 if (n <= 0)
2161 return n;
cec736d2 2162
de190aef
LP
2163 if (!o)
2164 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2165 else {
2166 if (o->object.type != OBJECT_ENTRY)
2167 return -EINVAL;
cec736d2 2168
de190aef
LP
2169 r = generic_array_bisect_plus_one(f,
2170 le64toh(d->data.entry_offset),
2171 le64toh(d->data.entry_array_offset),
2172 le64toh(d->data.n_entries),
2173 p,
2174 test_object_offset,
2175 DIRECTION_DOWN,
2176 NULL, NULL,
2177 &i);
2178
2179 if (r <= 0)
cec736d2
LP
2180 return r;
2181
de190aef
LP
2182 if (direction == DIRECTION_DOWN) {
2183 if (i >= n - 1)
2184 return 0;
cec736d2 2185
de190aef
LP
2186 i++;
2187 } else {
2188 if (i <= 0)
2189 return 0;
cec736d2 2190
de190aef
LP
2191 i--;
2192 }
cec736d2 2193
de190aef 2194 }
cec736d2 2195
de190aef
LP
2196 return generic_array_get_plus_one(f,
2197 le64toh(d->data.entry_offset),
2198 le64toh(d->data.entry_array_offset),
2199 i,
2200 ret, offset);
2201}
cec736d2 2202
cbdca852
LP
2203int journal_file_move_to_entry_by_offset_for_data(
2204 JournalFile *f,
2205 uint64_t data_offset,
2206 uint64_t p,
2207 direction_t direction,
2208 Object **ret, uint64_t *offset) {
2209
2210 int r;
2211 Object *d;
2212
2213 assert(f);
2214
2215 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2216 if (r < 0)
2217 return r;
2218
2219 return generic_array_bisect_plus_one(f,
2220 le64toh(d->data.entry_offset),
2221 le64toh(d->data.entry_array_offset),
2222 le64toh(d->data.n_entries),
2223 p,
2224 test_object_offset,
2225 direction,
2226 ret, offset, NULL);
2227}
2228
2229int journal_file_move_to_entry_by_monotonic_for_data(
2230 JournalFile *f,
2231 uint64_t data_offset,
2232 sd_id128_t boot_id,
2233 uint64_t monotonic,
2234 direction_t direction,
2235 Object **ret, uint64_t *offset) {
2236
cbdca852
LP
2237 Object *o, *d;
2238 int r;
2239 uint64_t b, z;
2240
2241 assert(f);
2242
2243 /* First, seek by time */
47838ab3 2244 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2245 if (r < 0)
2246 return r;
2247 if (r == 0)
2248 return -ENOENT;
2249
2250 r = generic_array_bisect_plus_one(f,
2251 le64toh(o->data.entry_offset),
2252 le64toh(o->data.entry_array_offset),
2253 le64toh(o->data.n_entries),
2254 monotonic,
2255 test_object_monotonic,
2256 direction,
2257 NULL, &z, NULL);
2258 if (r <= 0)
2259 return r;
2260
2261 /* And now, continue seeking until we find an entry that
2262 * exists in both bisection arrays */
2263
2264 for (;;) {
2265 Object *qo;
2266 uint64_t p, q;
2267
2268 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2269 if (r < 0)
2270 return r;
2271
2272 r = generic_array_bisect_plus_one(f,
2273 le64toh(d->data.entry_offset),
2274 le64toh(d->data.entry_array_offset),
2275 le64toh(d->data.n_entries),
2276 z,
2277 test_object_offset,
2278 direction,
2279 NULL, &p, NULL);
2280 if (r <= 0)
2281 return r;
2282
2283 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2284 if (r < 0)
2285 return r;
2286
2287 r = generic_array_bisect_plus_one(f,
2288 le64toh(o->data.entry_offset),
2289 le64toh(o->data.entry_array_offset),
2290 le64toh(o->data.n_entries),
2291 p,
2292 test_object_offset,
2293 direction,
2294 &qo, &q, NULL);
2295
2296 if (r <= 0)
2297 return r;
2298
2299 if (p == q) {
2300 if (ret)
2301 *ret = qo;
2302 if (offset)
2303 *offset = q;
2304
2305 return 1;
2306 }
2307
2308 z = q;
2309 }
cbdca852
LP
2310}
2311
de190aef
LP
2312int journal_file_move_to_entry_by_seqnum_for_data(
2313 JournalFile *f,
2314 uint64_t data_offset,
2315 uint64_t seqnum,
2316 direction_t direction,
2317 Object **ret, uint64_t *offset) {
cec736d2 2318
de190aef
LP
2319 Object *d;
2320 int r;
cec736d2 2321
91a31dde
LP
2322 assert(f);
2323
de190aef 2324 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2325 if (r < 0)
de190aef 2326 return r;
cec736d2 2327
de190aef
LP
2328 return generic_array_bisect_plus_one(f,
2329 le64toh(d->data.entry_offset),
2330 le64toh(d->data.entry_array_offset),
2331 le64toh(d->data.n_entries),
2332 seqnum,
2333 test_object_seqnum,
2334 direction,
2335 ret, offset, NULL);
2336}
cec736d2 2337
de190aef
LP
2338int journal_file_move_to_entry_by_realtime_for_data(
2339 JournalFile *f,
2340 uint64_t data_offset,
2341 uint64_t realtime,
2342 direction_t direction,
2343 Object **ret, uint64_t *offset) {
2344
2345 Object *d;
2346 int r;
2347
91a31dde
LP
2348 assert(f);
2349
de190aef 2350 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2351 if (r < 0)
de190aef
LP
2352 return r;
2353
2354 return generic_array_bisect_plus_one(f,
2355 le64toh(d->data.entry_offset),
2356 le64toh(d->data.entry_array_offset),
2357 le64toh(d->data.n_entries),
2358 realtime,
2359 test_object_realtime,
2360 direction,
2361 ret, offset, NULL);
cec736d2
LP
2362}
2363
0284adc6 2364void journal_file_dump(JournalFile *f) {
7560fffc 2365 Object *o;
7560fffc 2366 int r;
0284adc6 2367 uint64_t p;
7560fffc
LP
2368
2369 assert(f);
2370
0284adc6 2371 journal_file_print_header(f);
7560fffc 2372
0284adc6
LP
2373 p = le64toh(f->header->header_size);
2374 while (p != 0) {
d05089d8 2375 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
2376 if (r < 0)
2377 goto fail;
7560fffc 2378
0284adc6 2379 switch (o->object.type) {
d98cc1f2 2380
0284adc6
LP
2381 case OBJECT_UNUSED:
2382 printf("Type: OBJECT_UNUSED\n");
2383 break;
d98cc1f2 2384
0284adc6
LP
2385 case OBJECT_DATA:
2386 printf("Type: OBJECT_DATA\n");
2387 break;
7560fffc 2388
3c1668da
LP
2389 case OBJECT_FIELD:
2390 printf("Type: OBJECT_FIELD\n");
2391 break;
2392
0284adc6 2393 case OBJECT_ENTRY:
507f22bd
ZJS
2394 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2395 le64toh(o->entry.seqnum),
2396 le64toh(o->entry.monotonic),
2397 le64toh(o->entry.realtime));
0284adc6 2398 break;
7560fffc 2399
0284adc6
LP
2400 case OBJECT_FIELD_HASH_TABLE:
2401 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2402 break;
7560fffc 2403
0284adc6
LP
2404 case OBJECT_DATA_HASH_TABLE:
2405 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2406 break;
7560fffc 2407
0284adc6
LP
2408 case OBJECT_ENTRY_ARRAY:
2409 printf("Type: OBJECT_ENTRY_ARRAY\n");
2410 break;
7560fffc 2411
0284adc6 2412 case OBJECT_TAG:
507f22bd
ZJS
2413 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2414 le64toh(o->tag.seqnum),
2415 le64toh(o->tag.epoch));
0284adc6 2416 break;
3c1668da
LP
2417
2418 default:
8facc349 2419 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 2420 break;
0284adc6 2421 }
7560fffc 2422
d89c8fdf
ZJS
2423 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2424 printf("Flags: %s\n",
2425 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 2426
0284adc6
LP
2427 if (p == le64toh(f->header->tail_object_offset))
2428 p = 0;
2429 else
2430 p = p + ALIGN64(le64toh(o->object.size));
2431 }
7560fffc 2432
0284adc6
LP
2433 return;
2434fail:
2435 log_error("File corrupt");
7560fffc
LP
2436}
2437
718fe4b1
ZJS
2438static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2439 const char *x;
2440
2441 x = format_timestamp(buf, l, t);
2442 if (x)
2443 return x;
2444 return " --- ";
2445}
2446
0284adc6 2447void journal_file_print_header(JournalFile *f) {
2765b7bb 2448 char a[33], b[33], c[33], d[33];
ed375beb 2449 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2450 struct stat st;
2451 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2452
2453 assert(f);
7560fffc 2454
0284adc6
LP
2455 printf("File Path: %s\n"
2456 "File ID: %s\n"
2457 "Machine ID: %s\n"
2458 "Boot ID: %s\n"
2459 "Sequential Number ID: %s\n"
2460 "State: %s\n"
2461 "Compatible Flags:%s%s\n"
d89c8fdf 2462 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
2463 "Header size: %"PRIu64"\n"
2464 "Arena size: %"PRIu64"\n"
2465 "Data Hash Table Size: %"PRIu64"\n"
2466 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2467 "Rotate Suggested: %s\n"
507f22bd
ZJS
2468 "Head Sequential Number: %"PRIu64"\n"
2469 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2470 "Head Realtime Timestamp: %s\n"
3223f44f 2471 "Tail Realtime Timestamp: %s\n"
ed375beb 2472 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2473 "Objects: %"PRIu64"\n"
2474 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2475 f->path,
2476 sd_id128_to_string(f->header->file_id, a),
2477 sd_id128_to_string(f->header->machine_id, b),
2478 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2479 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2480 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2481 f->header->state == STATE_ONLINE ? "ONLINE" :
2482 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 2483 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
2484 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2485 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2486 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2487 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
2488 le64toh(f->header->header_size),
2489 le64toh(f->header->arena_size),
2490 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2491 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2492 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2493 le64toh(f->header->head_entry_seqnum),
2494 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2495 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2496 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2497 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2498 le64toh(f->header->n_objects),
2499 le64toh(f->header->n_entries));
7560fffc 2500
0284adc6 2501 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2502 printf("Data Objects: %"PRIu64"\n"
0284adc6 2503 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2504 le64toh(f->header->n_data),
0284adc6 2505 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2506
0284adc6 2507 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2508 printf("Field Objects: %"PRIu64"\n"
0284adc6 2509 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2510 le64toh(f->header->n_fields),
0284adc6 2511 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2512
2513 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2514 printf("Tag Objects: %"PRIu64"\n",
2515 le64toh(f->header->n_tags));
3223f44f 2516 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2517 printf("Entry Array Objects: %"PRIu64"\n",
2518 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2519
2520 if (fstat(f->fd, &st) >= 0)
2521 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
7560fffc
LP
2522}
2523
0284adc6
LP
2524int journal_file_open(
2525 const char *fname,
2526 int flags,
2527 mode_t mode,
2528 bool compress,
baed47c3 2529 bool seal,
0284adc6
LP
2530 JournalMetrics *metrics,
2531 MMapCache *mmap_cache,
2532 JournalFile *template,
2533 JournalFile **ret) {
7560fffc 2534
fa6ac760 2535 bool newly_created = false;
0284adc6 2536 JournalFile *f;
fa6ac760 2537 void *h;
0284adc6 2538 int r;
7560fffc 2539
0284adc6 2540 assert(fname);
0559d3a5 2541 assert(ret);
7560fffc 2542
0284adc6
LP
2543 if ((flags & O_ACCMODE) != O_RDONLY &&
2544 (flags & O_ACCMODE) != O_RDWR)
2545 return -EINVAL;
7560fffc 2546
a0108012
LP
2547 if (!endswith(fname, ".journal") &&
2548 !endswith(fname, ".journal~"))
0284adc6 2549 return -EINVAL;
7560fffc 2550
0284adc6
LP
2551 f = new0(JournalFile, 1);
2552 if (!f)
2553 return -ENOMEM;
7560fffc 2554
0284adc6
LP
2555 f->fd = -1;
2556 f->mode = mode;
7560fffc 2557
0284adc6
LP
2558 f->flags = flags;
2559 f->prot = prot_from_flags(flags);
2560 f->writable = (flags & O_ACCMODE) != O_RDONLY;
92261977 2561#if defined(HAVE_LZ4)
d89c8fdf
ZJS
2562 f->compress_lz4 = compress;
2563#elif defined(HAVE_XZ)
2564 f->compress_xz = compress;
48b61739 2565#endif
49a32d43 2566#ifdef HAVE_GCRYPT
baed47c3 2567 f->seal = seal;
49a32d43 2568#endif
7560fffc 2569
0284adc6
LP
2570 if (mmap_cache)
2571 f->mmap = mmap_cache_ref(mmap_cache);
2572 else {
84168d80 2573 f->mmap = mmap_cache_new();
0284adc6
LP
2574 if (!f->mmap) {
2575 r = -ENOMEM;
2576 goto fail;
2577 }
2578 }
7560fffc 2579
0284adc6
LP
2580 f->path = strdup(fname);
2581 if (!f->path) {
2582 r = -ENOMEM;
2583 goto fail;
2584 }
7560fffc 2585
4743015d 2586 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
2587 if (!f->chain_cache) {
2588 r = -ENOMEM;
2589 goto fail;
2590 }
2591
0284adc6
LP
2592 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2593 if (f->fd < 0) {
2594 r = -errno;
2595 goto fail;
7560fffc 2596 }
7560fffc 2597
2678031a
LP
2598 r = journal_file_fstat(f);
2599 if (r < 0)
0284adc6 2600 goto fail;
7560fffc 2601
0284adc6 2602 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a
LP
2603
2604 /* Before we write anything, turn off COW logic. Given
2605 * our write pattern that is quite unfriendly to COW
2606 * file systems this should greatly improve
2607 * performance on COW file systems, such as btrfs, at
2608 * the expense of data integrity features (which
2609 * shouldn't be too bad, given that we do our own
2610 * checksumming). */
1ed8f8c1 2611 r = chattr_fd(f->fd, FS_NOCOW_FL, FS_NOCOW_FL);
65eae3b7
CR
2612 if (r < 0 && r != -ENOTTY)
2613 log_warning_errno(r, "Failed to set file attributes: %m");
11689d2a 2614
fb0951b0
LP
2615 /* Let's attach the creation time to the journal file,
2616 * so that the vacuuming code knows the age of this
2617 * file even if the file might end up corrupted one
2618 * day... Ideally we'd just use the creation time many
2619 * file systems maintain for each file, but there is
2620 * currently no usable API to query this, hence let's
2621 * emulate this via extended attributes. If extended
2622 * attributes are not supported we'll just skip this,
7517e174 2623 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0 2624
d61b600d 2625 fd_setcrtime(f->fd, 0);
7560fffc 2626
feb12d3e 2627#ifdef HAVE_GCRYPT
0284adc6 2628 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2629 * just don't do sealing */
49a32d43
LP
2630 if (f->seal) {
2631 r = journal_file_fss_load(f);
2632 if (r < 0)
2633 f->seal = false;
2634 }
feb12d3e 2635#endif
7560fffc 2636
0284adc6
LP
2637 r = journal_file_init_header(f, template);
2638 if (r < 0)
2639 goto fail;
7560fffc 2640
2678031a
LP
2641 r = journal_file_fstat(f);
2642 if (r < 0)
0284adc6 2643 goto fail;
fb0951b0
LP
2644
2645 newly_created = true;
0284adc6 2646 }
7560fffc 2647
0284adc6
LP
2648 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2649 r = -EIO;
2650 goto fail;
2651 }
7560fffc 2652
fa6ac760 2653 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
977eaa1e 2654 if (r < 0)
0284adc6 2655 goto fail;
7560fffc 2656
fa6ac760
LP
2657 f->header = h;
2658
0284adc6
LP
2659 if (!newly_created) {
2660 r = journal_file_verify_header(f);
2661 if (r < 0)
2662 goto fail;
2663 }
7560fffc 2664
feb12d3e 2665#ifdef HAVE_GCRYPT
0284adc6 2666 if (!newly_created && f->writable) {
baed47c3 2667 r = journal_file_fss_load(f);
0284adc6
LP
2668 if (r < 0)
2669 goto fail;
2670 }
feb12d3e 2671#endif
cec736d2
LP
2672
2673 if (f->writable) {
4a92baf3
LP
2674 if (metrics) {
2675 journal_default_metrics(metrics, f->fd);
2676 f->metrics = *metrics;
2677 } else if (template)
2678 f->metrics = template->metrics;
2679
cec736d2
LP
2680 r = journal_file_refresh_header(f);
2681 if (r < 0)
2682 goto fail;
2683 }
2684
feb12d3e 2685#ifdef HAVE_GCRYPT
baed47c3 2686 r = journal_file_hmac_setup(f);
14d10188
LP
2687 if (r < 0)
2688 goto fail;
feb12d3e 2689#endif
14d10188 2690
cec736d2 2691 if (newly_created) {
de190aef 2692 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2693 if (r < 0)
2694 goto fail;
2695
de190aef 2696 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2697 if (r < 0)
2698 goto fail;
7560fffc 2699
feb12d3e 2700#ifdef HAVE_GCRYPT
7560fffc
LP
2701 r = journal_file_append_first_tag(f);
2702 if (r < 0)
2703 goto fail;
feb12d3e 2704#endif
cec736d2
LP
2705 }
2706
de190aef 2707 r = journal_file_map_field_hash_table(f);
cec736d2
LP
2708 if (r < 0)
2709 goto fail;
2710
de190aef 2711 r = journal_file_map_data_hash_table(f);
cec736d2
LP
2712 if (r < 0)
2713 goto fail;
2714
fa6ac760
LP
2715 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2716 r = -EIO;
2717 goto fail;
2718 }
2719
0559d3a5 2720 *ret = f;
cec736d2
LP
2721 return 0;
2722
2723fail:
fa6ac760
LP
2724 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2725 r = -EIO;
2726
cec736d2
LP
2727 journal_file_close(f);
2728
2729 return r;
2730}
0ac38b70 2731
baed47c3 2732int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2733 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2734 size_t l;
2735 JournalFile *old_file, *new_file = NULL;
2736 int r;
2737
2738 assert(f);
2739 assert(*f);
2740
2741 old_file = *f;
2742
2743 if (!old_file->writable)
2744 return -EINVAL;
2745
2746 if (!endswith(old_file->path, ".journal"))
2747 return -EINVAL;
2748
2749 l = strlen(old_file->path);
57535f47
ZJS
2750 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2751 (int) l - 8, old_file->path,
2752 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2753 le64toh((*f)->header->head_entry_seqnum),
2754 le64toh((*f)->header->head_entry_realtime));
2755 if (r < 0)
0ac38b70
LP
2756 return -ENOMEM;
2757
2678031a
LP
2758 /* Try to rename the file to the archived version. If the file
2759 * already was deleted, we'll get ENOENT, let's ignore that
2760 * case. */
0ac38b70 2761 r = rename(old_file->path, p);
2678031a 2762 if (r < 0 && errno != ENOENT)
0ac38b70
LP
2763 return -errno;
2764
ccdbaf91 2765 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2766
f27a3864
LP
2767 /* Currently, btrfs is not very good with out write patterns
2768 * and fragments heavily. Let's defrag our journal files when
2769 * we archive them */
2770 old_file->defrag_on_close = true;
2771
baed47c3 2772 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2773 journal_file_close(old_file);
2774
2775 *f = new_file;
2776 return r;
2777}
2778
9447a7f1
LP
2779int journal_file_open_reliably(
2780 const char *fname,
2781 int flags,
2782 mode_t mode,
7560fffc 2783 bool compress,
baed47c3 2784 bool seal,
4a92baf3 2785 JournalMetrics *metrics,
27370278 2786 MMapCache *mmap_cache,
9447a7f1
LP
2787 JournalFile *template,
2788 JournalFile **ret) {
2789
2790 int r;
2791 size_t l;
ed375beb 2792 _cleanup_free_ char *p = NULL;
9447a7f1 2793
baed47c3 2794 r = journal_file_open(fname, flags, mode, compress, seal,
27370278 2795 metrics, mmap_cache, template, ret);
288359db
ZJS
2796 if (!IN_SET(r,
2797 -EBADMSG, /* corrupted */
2798 -ENODATA, /* truncated */
2799 -EHOSTDOWN, /* other machine */
2800 -EPROTONOSUPPORT, /* incompatible feature */
2801 -EBUSY, /* unclean shutdown */
2802 -ESHUTDOWN, /* already archived */
2803 -EIO, /* IO error, including SIGBUS on mmap */
2804 -EIDRM /* File has been deleted */))
9447a7f1
LP
2805 return r;
2806
2807 if ((flags & O_ACCMODE) == O_RDONLY)
2808 return r;
2809
2810 if (!(flags & O_CREAT))
2811 return r;
2812
7560fffc
LP
2813 if (!endswith(fname, ".journal"))
2814 return r;
2815
5c70eab4
LP
2816 /* The file is corrupted. Rotate it away and try it again (but only once) */
2817
9447a7f1 2818 l = strlen(fname);
d587eca5 2819 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
57535f47 2820 (int) l - 8, fname,
d587eca5 2821 now(CLOCK_REALTIME),
9bf3b535 2822 random_u64()) < 0)
9447a7f1
LP
2823 return -ENOMEM;
2824
2825 r = rename(fname, p);
9447a7f1
LP
2826 if (r < 0)
2827 return -errno;
2828
f27a3864
LP
2829 /* btrfs doesn't cope well with our write pattern and
2830 * fragments heavily. Let's defrag all files we rotate */
11689d2a
LP
2831
2832 (void) chattr_path(p, false, FS_NOCOW_FL);
f27a3864
LP
2833 (void) btrfs_defrag(p);
2834
a1a1898f 2835 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2836
baed47c3 2837 return journal_file_open(fname, flags, mode, compress, seal,
27370278 2838 metrics, mmap_cache, template, ret);
9447a7f1
LP
2839}
2840
cf244689
LP
2841int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2842 uint64_t i, n;
2843 uint64_t q, xor_hash = 0;
2844 int r;
2845 EntryItem *items;
2846 dual_timestamp ts;
2847
2848 assert(from);
2849 assert(to);
2850 assert(o);
2851 assert(p);
2852
2853 if (!to->writable)
2854 return -EPERM;
2855
2856 ts.monotonic = le64toh(o->entry.monotonic);
2857 ts.realtime = le64toh(o->entry.realtime);
2858
cf244689 2859 n = journal_file_entry_n_items(o);
4faa7004
TA
2860 /* alloca() can't take 0, hence let's allocate at least one */
2861 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
2862
2863 for (i = 0; i < n; i++) {
4fd052ae
FC
2864 uint64_t l, h;
2865 le64_t le_hash;
cf244689
LP
2866 size_t t;
2867 void *data;
2868 Object *u;
2869
2870 q = le64toh(o->entry.items[i].object_offset);
2871 le_hash = o->entry.items[i].hash;
2872
2873 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2874 if (r < 0)
2875 return r;
2876
2877 if (le_hash != o->data.hash)
2878 return -EBADMSG;
2879
2880 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2881 t = (size_t) l;
2882
2883 /* We hit the limit on 32bit machines */
2884 if ((uint64_t) t != l)
2885 return -E2BIG;
2886
d89c8fdf 2887 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 2888#if defined(HAVE_XZ) || defined(HAVE_LZ4)
a7f7d1bd 2889 size_t rsize = 0;
cf244689 2890
d89c8fdf
ZJS
2891 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2892 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2893 if (r < 0)
2894 return r;
cf244689
LP
2895
2896 data = from->compress_buffer;
2897 l = rsize;
3b1a55e1
ZJS
2898#else
2899 return -EPROTONOSUPPORT;
2900#endif
cf244689
LP
2901 } else
2902 data = o->data.payload;
2903
2904 r = journal_file_append_data(to, data, l, &u, &h);
2905 if (r < 0)
2906 return r;
2907
2908 xor_hash ^= le64toh(u->data.hash);
2909 items[i].object_offset = htole64(h);
2910 items[i].hash = u->data.hash;
2911
2912 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2913 if (r < 0)
2914 return r;
2915 }
2916
fa6ac760
LP
2917 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2918
2919 if (mmap_cache_got_sigbus(to->mmap, to->fd))
2920 return -EIO;
2921
2922 return r;
cf244689 2923}
babfc091
LP
2924
2925void journal_default_metrics(JournalMetrics *m, int fd) {
2926 uint64_t fs_size = 0;
2927 struct statvfs ss;
a7bc2c2a 2928 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
babfc091
LP
2929
2930 assert(m);
2931 assert(fd >= 0);
2932
2933 if (fstatvfs(fd, &ss) >= 0)
2934 fs_size = ss.f_frsize * ss.f_blocks;
2935
2936 if (m->max_use == (uint64_t) -1) {
2937
2938 if (fs_size > 0) {
2939 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2940
2941 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2942 m->max_use = DEFAULT_MAX_USE_UPPER;
2943
2944 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2945 m->max_use = DEFAULT_MAX_USE_LOWER;
2946 } else
2947 m->max_use = DEFAULT_MAX_USE_LOWER;
2948 } else {
2949 m->max_use = PAGE_ALIGN(m->max_use);
2950
2951 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2952 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2953 }
2954
2955 if (m->max_size == (uint64_t) -1) {
2956 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2957
2958 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2959 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2960 } else
2961 m->max_size = PAGE_ALIGN(m->max_size);
2962
2963 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2964 m->max_size = JOURNAL_FILE_SIZE_MIN;
2965
2966 if (m->max_size*2 > m->max_use)
2967 m->max_use = m->max_size*2;
2968
2969 if (m->min_size == (uint64_t) -1)
2970 m->min_size = JOURNAL_FILE_SIZE_MIN;
2971 else {
2972 m->min_size = PAGE_ALIGN(m->min_size);
2973
2974 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2975 m->min_size = JOURNAL_FILE_SIZE_MIN;
2976
2977 if (m->min_size > m->max_size)
2978 m->max_size = m->min_size;
2979 }
2980
2981 if (m->keep_free == (uint64_t) -1) {
2982
2983 if (fs_size > 0) {
8621b110 2984 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
2985
2986 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2987 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2988
2989 } else
2990 m->keep_free = DEFAULT_KEEP_FREE;
2991 }
2992
2b43f939
LP
2993 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2994 format_bytes(a, sizeof(a), m->max_use),
2995 format_bytes(b, sizeof(b), m->max_size),
2996 format_bytes(c, sizeof(c), m->min_size),
2997 format_bytes(d, sizeof(d), m->keep_free));
babfc091 2998}
08984293
LP
2999
3000int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
3001 assert(f);
3002 assert(from || to);
3003
3004 if (from) {
162566a4
LP
3005 if (f->header->head_entry_realtime == 0)
3006 return -ENOENT;
08984293 3007
162566a4 3008 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
3009 }
3010
3011 if (to) {
162566a4
LP
3012 if (f->header->tail_entry_realtime == 0)
3013 return -ENOENT;
08984293 3014
162566a4 3015 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
3016 }
3017
3018 return 1;
3019}
3020
3021int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
3022 Object *o;
3023 uint64_t p;
3024 int r;
3025
3026 assert(f);
3027 assert(from || to);
3028
47838ab3 3029 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
3030 if (r <= 0)
3031 return r;
3032
3033 if (le64toh(o->data.n_entries) <= 0)
3034 return 0;
3035
3036 if (from) {
3037 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3038 if (r < 0)
3039 return r;
3040
3041 *from = le64toh(o->entry.monotonic);
3042 }
3043
3044 if (to) {
3045 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3046 if (r < 0)
3047 return r;
3048
3049 r = generic_array_get_plus_one(f,
3050 le64toh(o->data.entry_offset),
3051 le64toh(o->data.entry_array_offset),
3052 le64toh(o->data.n_entries)-1,
3053 &o, NULL);
3054 if (r <= 0)
3055 return r;
3056
3057 *to = le64toh(o->entry.monotonic);
3058 }
3059
3060 return 1;
3061}
dca6219e 3062
fb0951b0 3063bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
3064 assert(f);
3065
3066 /* If we gained new header fields we gained new features,
3067 * hence suggest a rotation */
361f9cbc
LP
3068 if (le64toh(f->header->header_size) < sizeof(Header)) {
3069 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3070 return true;
361f9cbc 3071 }
dca6219e
LP
3072
3073 /* Let's check if the hash tables grew over a certain fill
3074 * level (75%, borrowing this value from Java's hash table
3075 * implementation), and if so suggest a rotation. To calculate
3076 * the fill level we need the n_data field, which only exists
3077 * in newer versions. */
3078
3079 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3080 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3081 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3082 f->path,
3083 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3084 le64toh(f->header->n_data),
3085 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3086 (unsigned long long) f->last_stat.st_size,
3087 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3088 return true;
361f9cbc 3089 }
dca6219e
LP
3090
3091 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3092 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3093 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3094 f->path,
3095 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3096 le64toh(f->header->n_fields),
3097 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3098 return true;
361f9cbc 3099 }
dca6219e 3100
0598fd4a
LP
3101 /* Are the data objects properly indexed by field objects? */
3102 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3103 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3104 le64toh(f->header->n_data) > 0 &&
3105 le64toh(f->header->n_fields) == 0)
3106 return true;
3107
fb0951b0
LP
3108 if (max_file_usec > 0) {
3109 usec_t t, h;
3110
3111 h = le64toh(f->header->head_entry_realtime);
3112 t = now(CLOCK_REALTIME);
3113
3114 if (h > 0 && t > h + max_file_usec)
3115 return true;
3116 }
3117
dca6219e
LP
3118 return false;
3119}