]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
resolved: don't accept doing queries for invalid RR types
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
cec736d2 22#include <errno.h>
cec736d2 23#include <fcntl.h>
11689d2a 24#include <linux/fs.h>
07630cea
LP
25#include <stddef.h>
26#include <sys/mman.h>
27#include <sys/statvfs.h>
28#include <sys/uio.h>
29#include <unistd.h>
fb0951b0 30
b5efdb8a 31#include "alloc-util.h"
f27a3864 32#include "btrfs-util.h"
c8b3094d 33#include "chattr-util.h"
07630cea 34#include "compress.h"
3ffd4af2 35#include "fd-util.h"
0284adc6 36#include "journal-authenticate.h"
cec736d2
LP
37#include "journal-def.h"
38#include "journal-file.h"
39#include "lookup3.h"
6bedfcbb 40#include "parse-util.h"
3df3e884 41#include "random-util.h"
07630cea 42#include "string-util.h"
89a5a90c 43#include "xattr-util.h"
cec736d2 44
4a92baf3
LP
45#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
46#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 47
be19b7df 48#define COMPRESSION_SIZE_THRESHOLD (512ULL)
807e17f0 49
babfc091 50/* This is the minimum journal file size */
16098e93 51#define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
babfc091
LP
52
53/* These are the lower and upper bounds if we deduce the max_use value
54 * from the file system size */
55#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
56#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
57
8580d1f7
LP
58/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
59#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
60
babfc091 61/* This is the upper bound if we deduce max_size from max_use */
71100051 62#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
babfc091
LP
63
64/* This is the upper bound if we deduce the keep_free value from the
65 * file system size */
66#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
67
68/* This is the keep_free value when we can't determine the system
69 * size */
70#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
71
8580d1f7
LP
72/* This is the default maximum number of journal files to keep around. */
73#define DEFAULT_N_MAX_FILES (100)
74
dca6219e
LP
75/* n_data was the first entry we added after the initial file format design */
76#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 77
a4bcff5b
LP
78/* How many entries to keep in the entry array chain cache at max */
79#define CHAIN_CACHE_MAX 20
80
a676e665
LP
81/* How much to increase the journal file size at once each time we allocate something new. */
82#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
83
2678031a
LP
84/* Reread fstat() of the file for detecting deletions at least this often */
85#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
86
fa6ac760
LP
/* The mmap context to use for the header: we pick the one right above the last defined object type */
88#define CONTEXT_HEADER _OBJECT_TYPE_MAX
89
9588bc32 90static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
91 assert(f);
92
93 if (!f->writable)
94 return -EPERM;
95
96 if (!(f->fd >= 0 && f->header))
97 return -EINVAL;
98
fa6ac760
LP
99 if (mmap_cache_got_sigbus(f->mmap, f->fd))
100 return -EIO;
101
26687bf8
OS
102 switch(f->header->state) {
103 case STATE_ONLINE:
104 return 0;
105
106 case STATE_OFFLINE:
107 f->header->state = STATE_ONLINE;
108 fsync(f->fd);
109 return 0;
110
111 default:
112 return -EINVAL;
113 }
114}
115
116int journal_file_set_offline(JournalFile *f) {
117 assert(f);
118
119 if (!f->writable)
120 return -EPERM;
121
122 if (!(f->fd >= 0 && f->header))
123 return -EINVAL;
124
125 if (f->header->state != STATE_ONLINE)
126 return 0;
127
128 fsync(f->fd);
129
fa6ac760
LP
130 if (mmap_cache_got_sigbus(f->mmap, f->fd))
131 return -EIO;
132
26687bf8
OS
133 f->header->state = STATE_OFFLINE;
134
fa6ac760
LP
135 if (mmap_cache_got_sigbus(f->mmap, f->fd))
136 return -EIO;
137
26687bf8
OS
138 fsync(f->fd);
139
140 return 0;
141}
142
804ae586 143JournalFile* journal_file_close(JournalFile *f) {
de190aef 144 assert(f);
cec736d2 145
feb12d3e 146#ifdef HAVE_GCRYPT
b0af6f41 147 /* Write the final tag */
c586dbf1 148 if (f->seal && f->writable)
b0af6f41 149 journal_file_append_tag(f);
feb12d3e 150#endif
b0af6f41 151
26687bf8 152 journal_file_set_offline(f);
cec736d2 153
fa6ac760
LP
154 if (f->mmap && f->fd >= 0)
155 mmap_cache_close_fd(f->mmap, f->fd);
cec736d2 156
11689d2a
LP
157 if (f->fd >= 0 && f->defrag_on_close) {
158
159 /* Be friendly to btrfs: turn COW back on again now,
160 * and defragment the file. We won't write to the file
161 * ever again, hence remove all fragmentation, and
162 * reenable all the good bits COW usually provides
163 * (such as data checksumming). */
164
1ed8f8c1 165 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
11689d2a
LP
166 (void) btrfs_defrag_fd(f->fd);
167 }
f27a3864 168
03e334a1 169 safe_close(f->fd);
cec736d2 170 free(f->path);
807e17f0 171
16e9f408
LP
172 if (f->mmap)
173 mmap_cache_unref(f->mmap);
174
4743015d 175 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 176
d89c8fdf 177#if defined(HAVE_XZ) || defined(HAVE_LZ4)
807e17f0
LP
178 free(f->compress_buffer);
179#endif
180
7560fffc 181#ifdef HAVE_GCRYPT
baed47c3
LP
182 if (f->fss_file)
183 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
dc4ebc07 184 else
b7c9ae91
LP
185 free(f->fsprg_state);
186
187 free(f->fsprg_seed);
7560fffc
LP
188
189 if (f->hmac)
190 gcry_md_close(f->hmac);
191#endif
192
cec736d2 193 free(f);
804ae586 194 return NULL;
cec736d2
LP
195}
196
0ac38b70 197static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 198 Header h = {};
cec736d2
LP
199 ssize_t k;
200 int r;
201
202 assert(f);
203
7560fffc 204 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 205 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 206
d89c8fdf
ZJS
207 h.incompatible_flags |= htole32(
208 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
209 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 210
d89c8fdf
ZJS
211 h.compatible_flags = htole32(
212 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 213
cec736d2
LP
214 r = sd_id128_randomize(&h.file_id);
215 if (r < 0)
216 return r;
217
0ac38b70
LP
218 if (template) {
219 h.seqnum_id = template->header->seqnum_id;
beec0085 220 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
221 } else
222 h.seqnum_id = h.file_id;
cec736d2
LP
223
224 k = pwrite(f->fd, &h, sizeof(h), 0);
225 if (k < 0)
226 return -errno;
227
228 if (k != sizeof(h))
229 return -EIO;
230
231 return 0;
232}
233
234static int journal_file_refresh_header(JournalFile *f) {
de190aef 235 sd_id128_t boot_id;
fa6ac760 236 int r;
cec736d2
LP
237
238 assert(f);
239
240 r = sd_id128_get_machine(&f->header->machine_id);
241 if (r < 0)
242 return r;
243
de190aef 244 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
245 if (r < 0)
246 return r;
247
de190aef
LP
248 if (sd_id128_equal(boot_id, f->header->boot_id))
249 f->tail_entry_monotonic_valid = true;
250
251 f->header->boot_id = boot_id;
252
fa6ac760 253 r = journal_file_set_online(f);
b788cc23 254
7560fffc 255 /* Sync the online state to disk */
a676e665 256 fsync(f->fd);
b788cc23 257
fa6ac760 258 return r;
cec736d2
LP
259}
260
261static int journal_file_verify_header(JournalFile *f) {
d89c8fdf
ZJS
262 uint32_t flags;
263
cec736d2
LP
264 assert(f);
265
7560fffc 266 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
267 return -EBADMSG;
268
7560fffc
LP
269 /* In both read and write mode we refuse to open files with
270 * incompatible flags we don't know */
d89c8fdf
ZJS
271 flags = le32toh(f->header->incompatible_flags);
272 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
273 if (flags & ~HEADER_INCOMPATIBLE_ANY)
274 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
275 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
276 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
277 if (flags)
278 log_debug("Journal file %s uses incompatible flags %"PRIx32
279 " disabled at compilation time.", f->path, flags);
cec736d2 280 return -EPROTONOSUPPORT;
d89c8fdf 281 }
cec736d2 282
7560fffc
LP
283 /* When open for writing we refuse to open files with
284 * compatible flags, too */
d89c8fdf
ZJS
285 flags = le32toh(f->header->compatible_flags);
286 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
287 if (flags & ~HEADER_COMPATIBLE_ANY)
288 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
289 f->path, flags & ~HEADER_COMPATIBLE_ANY);
290 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
291 if (flags)
292 log_debug("Journal file %s uses compatible flags %"PRIx32
293 " disabled at compilation time.", f->path, flags);
294 return -EPROTONOSUPPORT;
7560fffc
LP
295 }
296
db11ac1a
LP
297 if (f->header->state >= _STATE_MAX)
298 return -EBADMSG;
299
dca6219e
LP
300 /* The first addition was n_data, so check that we are at least this large */
301 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
302 return -EBADMSG;
303
8088cbd3 304 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
305 return -EBADMSG;
306
db11ac1a
LP
307 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
308 return -ENODATA;
309
310 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
311 return -ENODATA;
312
7762e02b
LP
313 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
314 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
315 !VALID64(le64toh(f->header->tail_object_offset)) ||
316 !VALID64(le64toh(f->header->entry_array_offset)))
317 return -ENODATA;
318
cec736d2 319 if (f->writable) {
ccdbaf91 320 uint8_t state;
cec736d2
LP
321 sd_id128_t machine_id;
322 int r;
323
324 r = sd_id128_get_machine(&machine_id);
325 if (r < 0)
326 return r;
327
328 if (!sd_id128_equal(machine_id, f->header->machine_id))
329 return -EHOSTDOWN;
330
de190aef 331 state = f->header->state;
cec736d2 332
71fa6f00
LP
333 if (state == STATE_ONLINE) {
334 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
335 return -EBUSY;
336 } else if (state == STATE_ARCHIVED)
cec736d2 337 return -ESHUTDOWN;
71fa6f00 338 else if (state != STATE_OFFLINE) {
8facc349 339 log_debug("Journal file %s has unknown state %i.", f->path, state);
71fa6f00
LP
340 return -EBUSY;
341 }
cec736d2
LP
342 }
343
d89c8fdf
ZJS
344 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
345 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 346
f1889c91 347 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 348
cec736d2
LP
349 return 0;
350}
351
2678031a
LP
352static int journal_file_fstat(JournalFile *f) {
353 assert(f);
354 assert(f->fd >= 0);
355
356 if (fstat(f->fd, &f->last_stat) < 0)
357 return -errno;
358
359 f->last_stat_usec = now(CLOCK_MONOTONIC);
360
361 /* Refuse appending to files that are already deleted */
362 if (f->last_stat.st_nlink <= 0)
363 return -EIDRM;
364
365 return 0;
366}
367
cec736d2 368static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 369 uint64_t old_size, new_size;
fec2aa2f 370 int r;
cec736d2
LP
371
372 assert(f);
373
cec736d2 374 /* We assume that this file is not sparse, and we know that
38ac38b2 375 * for sure, since we always call posix_fallocate()
cec736d2
LP
376 * ourselves */
377
fa6ac760
LP
378 if (mmap_cache_got_sigbus(f->mmap, f->fd))
379 return -EIO;
380
cec736d2 381 old_size =
23b0b2b2 382 le64toh(f->header->header_size) +
cec736d2
LP
383 le64toh(f->header->arena_size);
384
bc85bfee 385 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
386 if (new_size < le64toh(f->header->header_size))
387 new_size = le64toh(f->header->header_size);
bc85bfee 388
2678031a
LP
389 if (new_size <= old_size) {
390
391 /* We already pre-allocated enough space, but before
392 * we write to it, let's check with fstat() if the
393 * file got deleted, in order make sure we don't throw
394 * away the data immediately. Don't check fstat() for
395 * all writes though, but only once ever 10s. */
396
397 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
398 return 0;
399
400 return journal_file_fstat(f);
401 }
402
403 /* Allocate more space. */
cec736d2 404
a676e665 405 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 406 return -E2BIG;
cec736d2 407
a676e665 408 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
409 struct statvfs svfs;
410
411 if (fstatvfs(f->fd, &svfs) >= 0) {
412 uint64_t available;
413
070052ab 414 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
cec736d2
LP
415
416 if (new_size - old_size > available)
417 return -E2BIG;
418 }
419 }
420
eda4b58b
LP
421 /* Increase by larger blocks at once */
422 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
423 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
424 new_size = f->metrics.max_size;
425
bc85bfee
LP
426 /* Note that the glibc fallocate() fallback is very
427 inefficient, hence we try to minimize the allocation area
428 as we can. */
fec2aa2f
GV
429 r = posix_fallocate(f->fd, old_size, new_size - old_size);
430 if (r != 0)
431 return -r;
cec736d2 432
23b0b2b2 433 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 434
2678031a 435 return journal_file_fstat(f);
cec736d2
LP
436}
437
78519831 438static unsigned type_to_context(ObjectType type) {
d3d3208f 439 /* One context for each type, plus one catch-all for the rest */
69adae51 440 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 441 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 442 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
443}
444
7a9dabea 445static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
2678031a
LP
446 int r;
447
cec736d2 448 assert(f);
cec736d2
LP
449 assert(ret);
450
7762e02b
LP
451 if (size <= 0)
452 return -EINVAL;
453
2a59ea54 454 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
455 if (offset + size > (uint64_t) f->last_stat.st_size) {
456 /* Hmm, out of range? Let's refresh the fstat() data
457 * first, before we trust that check. */
458
2678031a
LP
459 r = journal_file_fstat(f);
460 if (r < 0)
461 return r;
462
463 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
464 return -EADDRNOTAVAIL;
465 }
466
7a9dabea 467 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
468}
469
16e9f408
LP
470static uint64_t minimum_header_size(Object *o) {
471
b8e891e6 472 static const uint64_t table[] = {
16e9f408
LP
473 [OBJECT_DATA] = sizeof(DataObject),
474 [OBJECT_FIELD] = sizeof(FieldObject),
475 [OBJECT_ENTRY] = sizeof(EntryObject),
476 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
477 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
478 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
479 [OBJECT_TAG] = sizeof(TagObject),
480 };
481
482 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
483 return sizeof(ObjectHeader);
484
485 return table[o->object.type];
486}
487
78519831 488int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
489 int r;
490 void *t;
491 Object *o;
492 uint64_t s;
493
494 assert(f);
495 assert(ret);
496
db11ac1a
LP
497 /* Objects may only be located at multiple of 64 bit */
498 if (!VALID64(offset))
499 return -EFAULT;
500
7a9dabea 501 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
502 if (r < 0)
503 return r;
504
505 o = (Object*) t;
506 s = le64toh(o->object.size);
507
508 if (s < sizeof(ObjectHeader))
509 return -EBADMSG;
510
16e9f408
LP
511 if (o->object.type <= OBJECT_UNUSED)
512 return -EBADMSG;
513
514 if (s < minimum_header_size(o))
515 return -EBADMSG;
516
d05089d8 517 if (type > OBJECT_UNUSED && o->object.type != type)
cec736d2
LP
518 return -EBADMSG;
519
520 if (s > sizeof(ObjectHeader)) {
7a9dabea 521 r = journal_file_move_to(f, type, false, offset, s, &t);
cec736d2
LP
522 if (r < 0)
523 return r;
524
525 o = (Object*) t;
526 }
527
cec736d2
LP
528 *ret = o;
529 return 0;
530}
531
d98cc1f2 532static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
533 uint64_t r;
534
535 assert(f);
536
beec0085 537 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
538
539 if (seqnum) {
de190aef 540 /* If an external seqnum counter was passed, we update
c2373f84
LP
541 * both the local and the external one, and set it to
542 * the maximum of both */
543
544 if (*seqnum + 1 > r)
545 r = *seqnum + 1;
546
547 *seqnum = r;
548 }
549
beec0085 550 f->header->tail_entry_seqnum = htole64(r);
cec736d2 551
beec0085
LP
552 if (f->header->head_entry_seqnum == 0)
553 f->header->head_entry_seqnum = htole64(r);
de190aef 554
cec736d2
LP
555 return r;
556}
557
78519831 558int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
559 int r;
560 uint64_t p;
561 Object *tail, *o;
562 void *t;
563
564 assert(f);
d05089d8 565 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
566 assert(size >= sizeof(ObjectHeader));
567 assert(offset);
568 assert(ret);
569
26687bf8
OS
570 r = journal_file_set_online(f);
571 if (r < 0)
572 return r;
573
cec736d2 574 p = le64toh(f->header->tail_object_offset);
cec736d2 575 if (p == 0)
23b0b2b2 576 p = le64toh(f->header->header_size);
cec736d2 577 else {
d05089d8 578 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
579 if (r < 0)
580 return r;
581
582 p += ALIGN64(le64toh(tail->object.size));
583 }
584
585 r = journal_file_allocate(f, p, size);
586 if (r < 0)
587 return r;
588
fcde2389 589 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
590 if (r < 0)
591 return r;
592
593 o = (Object*) t;
594
595 zero(o->object);
de190aef 596 o->object.type = type;
cec736d2
LP
597 o->object.size = htole64(size);
598
599 f->header->tail_object_offset = htole64(p);
cec736d2
LP
600 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
601
602 *ret = o;
603 *offset = p;
604
605 return 0;
606}
607
de190aef 608static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
609 uint64_t s, p;
610 Object *o;
611 int r;
612
613 assert(f);
614
070052ab
LP
615 /* We estimate that we need 1 hash table entry per 768 bytes
616 of journal file and we want to make sure we never get
617 beyond 75% fill level. Calculate the hash table size for
618 the maximum file size based on these metrics. */
4a92baf3 619
dfabe643 620 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
621 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
622 s = DEFAULT_DATA_HASH_TABLE_SIZE;
623
507f22bd 624 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 625
de190aef
LP
626 r = journal_file_append_object(f,
627 OBJECT_DATA_HASH_TABLE,
628 offsetof(Object, hash_table.items) + s,
629 &o, &p);
cec736d2
LP
630 if (r < 0)
631 return r;
632
29804cc1 633 memzero(o->hash_table.items, s);
cec736d2 634
de190aef
LP
635 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
636 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
637
638 return 0;
639}
640
de190aef 641static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
642 uint64_t s, p;
643 Object *o;
644 int r;
645
646 assert(f);
647
3c1668da
LP
648 /* We use a fixed size hash table for the fields as this
649 * number should grow very slowly only */
650
de190aef
LP
651 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
652 r = journal_file_append_object(f,
653 OBJECT_FIELD_HASH_TABLE,
654 offsetof(Object, hash_table.items) + s,
655 &o, &p);
cec736d2
LP
656 if (r < 0)
657 return r;
658
29804cc1 659 memzero(o->hash_table.items, s);
cec736d2 660
de190aef
LP
661 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
662 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
663
664 return 0;
665}
666
dade37d4 667int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
668 uint64_t s, p;
669 void *t;
670 int r;
671
672 assert(f);
673
dade37d4
LP
674 if (f->data_hash_table)
675 return 0;
676
de190aef
LP
677 p = le64toh(f->header->data_hash_table_offset);
678 s = le64toh(f->header->data_hash_table_size);
cec736d2 679
de190aef 680 r = journal_file_move_to(f,
16e9f408 681 OBJECT_DATA_HASH_TABLE,
fcde2389 682 true,
de190aef
LP
683 p, s,
684 &t);
cec736d2
LP
685 if (r < 0)
686 return r;
687
de190aef 688 f->data_hash_table = t;
cec736d2
LP
689 return 0;
690}
691
dade37d4 692int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
693 uint64_t s, p;
694 void *t;
695 int r;
696
697 assert(f);
698
dade37d4
LP
699 if (f->field_hash_table)
700 return 0;
701
de190aef
LP
702 p = le64toh(f->header->field_hash_table_offset);
703 s = le64toh(f->header->field_hash_table_size);
cec736d2 704
de190aef 705 r = journal_file_move_to(f,
16e9f408 706 OBJECT_FIELD_HASH_TABLE,
fcde2389 707 true,
de190aef
LP
708 p, s,
709 &t);
cec736d2
LP
710 if (r < 0)
711 return r;
712
de190aef 713 f->field_hash_table = t;
cec736d2
LP
714 return 0;
715}
716
3c1668da
LP
717static int journal_file_link_field(
718 JournalFile *f,
719 Object *o,
720 uint64_t offset,
721 uint64_t hash) {
722
805d1486 723 uint64_t p, h, m;
3c1668da
LP
724 int r;
725
726 assert(f);
727 assert(o);
728 assert(offset > 0);
729
730 if (o->object.type != OBJECT_FIELD)
731 return -EINVAL;
732
805d1486
LP
733 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
734 if (m <= 0)
735 return -EBADMSG;
3c1668da 736
805d1486 737 /* This might alter the window we are looking at */
3c1668da
LP
738 o->field.next_hash_offset = o->field.head_data_offset = 0;
739
805d1486 740 h = hash % m;
3c1668da
LP
741 p = le64toh(f->field_hash_table[h].tail_hash_offset);
742 if (p == 0)
743 f->field_hash_table[h].head_hash_offset = htole64(offset);
744 else {
745 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
746 if (r < 0)
747 return r;
748
749 o->field.next_hash_offset = htole64(offset);
750 }
751
752 f->field_hash_table[h].tail_hash_offset = htole64(offset);
753
754 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
755 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
756
757 return 0;
758}
759
760static int journal_file_link_data(
761 JournalFile *f,
762 Object *o,
763 uint64_t offset,
764 uint64_t hash) {
765
805d1486 766 uint64_t p, h, m;
cec736d2
LP
767 int r;
768
769 assert(f);
770 assert(o);
771 assert(offset > 0);
b588975f
LP
772
773 if (o->object.type != OBJECT_DATA)
774 return -EINVAL;
cec736d2 775
805d1486
LP
776 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
777 if (m <= 0)
778 return -EBADMSG;
48496df6 779
805d1486 780 /* This might alter the window we are looking at */
de190aef
LP
781 o->data.next_hash_offset = o->data.next_field_offset = 0;
782 o->data.entry_offset = o->data.entry_array_offset = 0;
783 o->data.n_entries = 0;
cec736d2 784
805d1486 785 h = hash % m;
8db4213e 786 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 787 if (p == 0)
cec736d2 788 /* Only entry in the hash table is easy */
de190aef 789 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 790 else {
48496df6
LP
791 /* Move back to the previous data object, to patch in
792 * pointer */
cec736d2 793
de190aef 794 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
795 if (r < 0)
796 return r;
797
de190aef 798 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
799 }
800
de190aef 801 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 802
dca6219e
LP
803 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
804 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
805
cec736d2
LP
806 return 0;
807}
808
3c1668da
LP
809int journal_file_find_field_object_with_hash(
810 JournalFile *f,
811 const void *field, uint64_t size, uint64_t hash,
812 Object **ret, uint64_t *offset) {
813
805d1486 814 uint64_t p, osize, h, m;
3c1668da
LP
815 int r;
816
817 assert(f);
818 assert(field && size > 0);
819
dade37d4
LP
820 /* If the field hash table is empty, we can't find anything */
821 if (le64toh(f->header->field_hash_table_size) <= 0)
822 return 0;
823
824 /* Map the field hash table, if it isn't mapped yet. */
825 r = journal_file_map_field_hash_table(f);
826 if (r < 0)
827 return r;
828
3c1668da
LP
829 osize = offsetof(Object, field.payload) + size;
830
805d1486 831 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
805d1486 832 if (m <= 0)
3c1668da
LP
833 return -EBADMSG;
834
805d1486 835 h = hash % m;
3c1668da
LP
836 p = le64toh(f->field_hash_table[h].head_hash_offset);
837
838 while (p > 0) {
839 Object *o;
840
841 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
842 if (r < 0)
843 return r;
844
845 if (le64toh(o->field.hash) == hash &&
846 le64toh(o->object.size) == osize &&
847 memcmp(o->field.payload, field, size) == 0) {
848
849 if (ret)
850 *ret = o;
851 if (offset)
852 *offset = p;
853
854 return 1;
855 }
856
857 p = le64toh(o->field.next_hash_offset);
858 }
859
860 return 0;
861}
862
863int journal_file_find_field_object(
864 JournalFile *f,
865 const void *field, uint64_t size,
866 Object **ret, uint64_t *offset) {
867
868 uint64_t hash;
869
870 assert(f);
871 assert(field && size > 0);
872
873 hash = hash64(field, size);
874
875 return journal_file_find_field_object_with_hash(f,
876 field, size, hash,
877 ret, offset);
878}
879
de190aef
LP
880int journal_file_find_data_object_with_hash(
881 JournalFile *f,
882 const void *data, uint64_t size, uint64_t hash,
883 Object **ret, uint64_t *offset) {
48496df6 884
805d1486 885 uint64_t p, osize, h, m;
cec736d2
LP
886 int r;
887
888 assert(f);
889 assert(data || size == 0);
890
dade37d4
LP
891 /* If there's no data hash table, then there's no entry. */
892 if (le64toh(f->header->data_hash_table_size) <= 0)
893 return 0;
894
895 /* Map the data hash table, if it isn't mapped yet. */
896 r = journal_file_map_data_hash_table(f);
897 if (r < 0)
898 return r;
899
cec736d2
LP
900 osize = offsetof(Object, data.payload) + size;
901
805d1486
LP
902 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
903 if (m <= 0)
bc85bfee
LP
904 return -EBADMSG;
905
805d1486 906 h = hash % m;
de190aef 907 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 908
de190aef
LP
909 while (p > 0) {
910 Object *o;
cec736d2 911
de190aef 912 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
913 if (r < 0)
914 return r;
915
807e17f0 916 if (le64toh(o->data.hash) != hash)
85a131e8 917 goto next;
807e17f0 918
d89c8fdf 919 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 920#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51 921 uint64_t l;
a7f7d1bd 922 size_t rsize = 0;
cec736d2 923
807e17f0
LP
924 l = le64toh(o->object.size);
925 if (l <= offsetof(Object, data.payload))
cec736d2
LP
926 return -EBADMSG;
927
807e17f0
LP
928 l -= offsetof(Object, data.payload);
929
d89c8fdf
ZJS
930 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
931 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
932 if (r < 0)
933 return r;
807e17f0 934
b785c858 935 if (rsize == size &&
807e17f0
LP
936 memcmp(f->compress_buffer, data, size) == 0) {
937
938 if (ret)
939 *ret = o;
940
941 if (offset)
942 *offset = p;
943
944 return 1;
945 }
3b1a55e1
ZJS
946#else
947 return -EPROTONOSUPPORT;
948#endif
807e17f0
LP
949 } else if (le64toh(o->object.size) == osize &&
950 memcmp(o->data.payload, data, size) == 0) {
951
cec736d2
LP
952 if (ret)
953 *ret = o;
954
955 if (offset)
956 *offset = p;
957
de190aef 958 return 1;
cec736d2
LP
959 }
960
85a131e8 961 next:
cec736d2
LP
962 p = le64toh(o->data.next_hash_offset);
963 }
964
de190aef
LP
965 return 0;
966}
967
968int journal_file_find_data_object(
969 JournalFile *f,
970 const void *data, uint64_t size,
971 Object **ret, uint64_t *offset) {
972
973 uint64_t hash;
974
975 assert(f);
976 assert(data || size == 0);
977
978 hash = hash64(data, size);
979
980 return journal_file_find_data_object_with_hash(f,
981 data, size, hash,
982 ret, offset);
983}
984
3c1668da
LP
985static int journal_file_append_field(
986 JournalFile *f,
987 const void *field, uint64_t size,
988 Object **ret, uint64_t *offset) {
989
990 uint64_t hash, p;
991 uint64_t osize;
992 Object *o;
993 int r;
994
995 assert(f);
996 assert(field && size > 0);
997
998 hash = hash64(field, size);
999
1000 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1001 if (r < 0)
1002 return r;
1003 else if (r > 0) {
1004
1005 if (ret)
1006 *ret = o;
1007
1008 if (offset)
1009 *offset = p;
1010
1011 return 0;
1012 }
1013
1014 osize = offsetof(Object, field.payload) + size;
1015 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
1016 if (r < 0)
1017 return r;
3c1668da
LP
1018
1019 o->field.hash = htole64(hash);
1020 memcpy(o->field.payload, field, size);
1021
1022 r = journal_file_link_field(f, o, p, hash);
1023 if (r < 0)
1024 return r;
1025
1026 /* The linking might have altered the window, so let's
1027 * refresh our pointer */
1028 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1029 if (r < 0)
1030 return r;
1031
1032#ifdef HAVE_GCRYPT
1033 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1034 if (r < 0)
1035 return r;
1036#endif
1037
1038 if (ret)
1039 *ret = o;
1040
1041 if (offset)
1042 *offset = p;
1043
1044 return 0;
1045}
1046
48496df6
LP
1047static int journal_file_append_data(
1048 JournalFile *f,
1049 const void *data, uint64_t size,
1050 Object **ret, uint64_t *offset) {
1051
de190aef
LP
1052 uint64_t hash, p;
1053 uint64_t osize;
1054 Object *o;
d89c8fdf 1055 int r, compression = 0;
3c1668da 1056 const void *eq;
de190aef
LP
1057
1058 assert(f);
1059 assert(data || size == 0);
1060
1061 hash = hash64(data, size);
1062
1063 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1064 if (r < 0)
1065 return r;
0240c603 1066 if (r > 0) {
de190aef
LP
1067
1068 if (ret)
1069 *ret = o;
1070
1071 if (offset)
1072 *offset = p;
1073
1074 return 0;
1075 }
1076
1077 osize = offsetof(Object, data.payload) + size;
1078 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1079 if (r < 0)
1080 return r;
1081
cec736d2 1082 o->data.hash = htole64(hash);
807e17f0 1083
d89c8fdf 1084#if defined(HAVE_XZ) || defined(HAVE_LZ4)
d1afbcd2 1085 if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
a7f7d1bd 1086 size_t rsize = 0;
807e17f0 1087
d89c8fdf 1088 compression = compress_blob(data, size, o->data.payload, &rsize);
807e17f0 1089
d1afbcd2 1090 if (compression >= 0) {
807e17f0 1091 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1092 o->object.flags |= compression;
807e17f0 1093
fa1c4b51 1094 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1095 size, rsize, object_compressed_to_string(compression));
d1afbcd2
LP
1096 } else
1097 /* Compression didn't work, we don't really care why, let's continue without compression */
1098 compression = 0;
807e17f0
LP
1099 }
1100#endif
1101
d1afbcd2 1102 if (compression == 0 && size > 0)
807e17f0 1103 memcpy(o->data.payload, data, size);
cec736d2 1104
de190aef 1105 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1106 if (r < 0)
1107 return r;
1108
48496df6
LP
1109 /* The linking might have altered the window, so let's
1110 * refresh our pointer */
1111 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1112 if (r < 0)
1113 return r;
1114
08c6f819
SL
1115 if (!data)
1116 eq = NULL;
1117 else
1118 eq = memchr(data, '=', size);
3c1668da 1119 if (eq && eq > data) {
748db592 1120 Object *fo = NULL;
3c1668da 1121 uint64_t fp;
3c1668da
LP
1122
1123 /* Create field object ... */
1124 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1125 if (r < 0)
1126 return r;
1127
1128 /* ... and link it in. */
1129 o->data.next_field_offset = fo->field.head_data_offset;
1130 fo->field.head_data_offset = le64toh(p);
1131 }
1132
5996c7c2
LP
1133#ifdef HAVE_GCRYPT
1134 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1135 if (r < 0)
1136 return r;
1137#endif
1138
cec736d2
LP
1139 if (ret)
1140 *ret = o;
1141
1142 if (offset)
de190aef 1143 *offset = p;
cec736d2
LP
1144
1145 return 0;
1146}
1147
1148uint64_t journal_file_entry_n_items(Object *o) {
1149 assert(o);
b588975f
LP
1150
1151 if (o->object.type != OBJECT_ENTRY)
1152 return 0;
cec736d2
LP
1153
1154 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1155}
1156
0284adc6 1157uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1158 assert(o);
b588975f
LP
1159
1160 if (o->object.type != OBJECT_ENTRY_ARRAY)
1161 return 0;
de190aef
LP
1162
1163 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1164}
1165
fb9a24b6
LP
1166uint64_t journal_file_hash_table_n_items(Object *o) {
1167 assert(o);
b588975f
LP
1168
1169 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1170 o->object.type != OBJECT_FIELD_HASH_TABLE)
1171 return 0;
fb9a24b6
LP
1172
1173 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1174}
1175
de190aef 1176static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1177 le64_t *first,
1178 le64_t *idx,
de190aef 1179 uint64_t p) {
cec736d2 1180 int r;
de190aef
LP
1181 uint64_t n = 0, ap = 0, q, i, a, hidx;
1182 Object *o;
1183
cec736d2 1184 assert(f);
de190aef
LP
1185 assert(first);
1186 assert(idx);
1187 assert(p > 0);
cec736d2 1188
de190aef
LP
1189 a = le64toh(*first);
1190 i = hidx = le64toh(*idx);
1191 while (a > 0) {
1192
1193 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1194 if (r < 0)
1195 return r;
cec736d2 1196
de190aef
LP
1197 n = journal_file_entry_array_n_items(o);
1198 if (i < n) {
1199 o->entry_array.items[i] = htole64(p);
1200 *idx = htole64(hidx + 1);
1201 return 0;
1202 }
cec736d2 1203
de190aef
LP
1204 i -= n;
1205 ap = a;
1206 a = le64toh(o->entry_array.next_entry_array_offset);
1207 }
1208
1209 if (hidx > n)
1210 n = (hidx+1) * 2;
1211 else
1212 n = n * 2;
1213
1214 if (n < 4)
1215 n = 4;
1216
1217 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1218 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1219 &o, &q);
cec736d2
LP
1220 if (r < 0)
1221 return r;
1222
feb12d3e 1223#ifdef HAVE_GCRYPT
5996c7c2 1224 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1225 if (r < 0)
1226 return r;
feb12d3e 1227#endif
b0af6f41 1228
de190aef 1229 o->entry_array.items[i] = htole64(p);
cec736d2 1230
de190aef 1231 if (ap == 0)
7be3aa17 1232 *first = htole64(q);
cec736d2 1233 else {
de190aef 1234 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1235 if (r < 0)
1236 return r;
1237
de190aef
LP
1238 o->entry_array.next_entry_array_offset = htole64(q);
1239 }
cec736d2 1240
2dee23eb
LP
1241 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1242 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1243
de190aef
LP
1244 *idx = htole64(hidx + 1);
1245
1246 return 0;
1247}
cec736d2 1248
de190aef 1249static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1250 le64_t *extra,
1251 le64_t *first,
1252 le64_t *idx,
de190aef
LP
1253 uint64_t p) {
1254
1255 int r;
1256
1257 assert(f);
1258 assert(extra);
1259 assert(first);
1260 assert(idx);
1261 assert(p > 0);
1262
1263 if (*idx == 0)
1264 *extra = htole64(p);
1265 else {
4fd052ae 1266 le64_t i;
de190aef 1267
7be3aa17 1268 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1269 r = link_entry_into_array(f, first, &i, p);
1270 if (r < 0)
1271 return r;
cec736d2
LP
1272 }
1273
de190aef
LP
1274 *idx = htole64(le64toh(*idx) + 1);
1275 return 0;
1276}
1277
1278static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1279 uint64_t p;
1280 int r;
1281 assert(f);
1282 assert(o);
1283 assert(offset > 0);
1284
1285 p = le64toh(o->entry.items[i].object_offset);
1286 if (p == 0)
1287 return -EINVAL;
1288
1289 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1290 if (r < 0)
1291 return r;
1292
de190aef
LP
1293 return link_entry_into_array_plus_one(f,
1294 &o->data.entry_offset,
1295 &o->data.entry_array_offset,
1296 &o->data.n_entries,
1297 offset);
cec736d2
LP
1298}
1299
1300static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1301 uint64_t n, i;
cec736d2
LP
1302 int r;
1303
1304 assert(f);
1305 assert(o);
1306 assert(offset > 0);
b588975f
LP
1307
1308 if (o->object.type != OBJECT_ENTRY)
1309 return -EINVAL;
cec736d2 1310
b788cc23
LP
1311 __sync_synchronize();
1312
cec736d2 1313 /* Link up the entry itself */
de190aef
LP
1314 r = link_entry_into_array(f,
1315 &f->header->entry_array_offset,
1316 &f->header->n_entries,
1317 offset);
1318 if (r < 0)
1319 return r;
cec736d2 1320
507f22bd 1321 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1322
de190aef 1323 if (f->header->head_entry_realtime == 0)
0ac38b70 1324 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1325
0ac38b70 1326 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1327 f->header->tail_entry_monotonic = o->entry.monotonic;
1328
1329 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1330
1331 /* Link up the items */
1332 n = journal_file_entry_n_items(o);
1333 for (i = 0; i < n; i++) {
1334 r = journal_file_link_entry_item(f, o, offset, i);
1335 if (r < 0)
1336 return r;
1337 }
1338
cec736d2
LP
1339 return 0;
1340}
1341
1342static int journal_file_append_entry_internal(
1343 JournalFile *f,
1344 const dual_timestamp *ts,
1345 uint64_t xor_hash,
1346 const EntryItem items[], unsigned n_items,
de190aef 1347 uint64_t *seqnum,
cec736d2
LP
1348 Object **ret, uint64_t *offset) {
1349 uint64_t np;
1350 uint64_t osize;
1351 Object *o;
1352 int r;
1353
1354 assert(f);
1355 assert(items || n_items == 0);
de190aef 1356 assert(ts);
cec736d2
LP
1357
1358 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1359
de190aef 1360 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1361 if (r < 0)
1362 return r;
1363
d98cc1f2 1364 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1365 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1366 o->entry.realtime = htole64(ts->realtime);
1367 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1368 o->entry.xor_hash = htole64(xor_hash);
1369 o->entry.boot_id = f->header->boot_id;
1370
feb12d3e 1371#ifdef HAVE_GCRYPT
5996c7c2 1372 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1373 if (r < 0)
1374 return r;
feb12d3e 1375#endif
b0af6f41 1376
cec736d2
LP
1377 r = journal_file_link_entry(f, o, np);
1378 if (r < 0)
1379 return r;
1380
1381 if (ret)
1382 *ret = o;
1383
1384 if (offset)
1385 *offset = np;
1386
1387 return 0;
1388}
1389
cf244689 1390void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1391 assert(f);
1392
1393 /* inotify() does not receive IN_MODIFY events from file
1394 * accesses done via mmap(). After each access we hence
1395 * trigger IN_MODIFY by truncating the journal file to its
1396 * current size which triggers IN_MODIFY. */
1397
bc85bfee
LP
1398 __sync_synchronize();
1399
50f20cfd 1400 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
56f64d95 1401 log_error_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1402}
1403
1f2da9ec
LP
1404static int entry_item_cmp(const void *_a, const void *_b) {
1405 const EntryItem *a = _a, *b = _b;
1406
1407 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1408 return -1;
1409 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1410 return 1;
1411 return 0;
1412}
1413
de190aef 1414int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1415 unsigned i;
1416 EntryItem *items;
1417 int r;
1418 uint64_t xor_hash = 0;
de190aef 1419 struct dual_timestamp _ts;
cec736d2
LP
1420
1421 assert(f);
1422 assert(iovec || n_iovec == 0);
1423
de190aef
LP
1424 if (!ts) {
1425 dual_timestamp_get(&_ts);
1426 ts = &_ts;
1427 }
1428
1429 if (f->tail_entry_monotonic_valid &&
1430 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1431 return -EINVAL;
1432
feb12d3e 1433#ifdef HAVE_GCRYPT
7560fffc
LP
1434 r = journal_file_maybe_append_tag(f, ts->realtime);
1435 if (r < 0)
1436 return r;
feb12d3e 1437#endif
7560fffc 1438
64825d3c 1439 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1440 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1441
1442 for (i = 0; i < n_iovec; i++) {
1443 uint64_t p;
1444 Object *o;
1445
1446 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1447 if (r < 0)
cf244689 1448 return r;
cec736d2
LP
1449
1450 xor_hash ^= le64toh(o->data.hash);
1451 items[i].object_offset = htole64(p);
de7b95cd 1452 items[i].hash = o->data.hash;
cec736d2
LP
1453 }
1454
1f2da9ec
LP
1455 /* Order by the position on disk, in order to improve seek
1456 * times for rotating media. */
7ff7394d 1457 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1458
de190aef 1459 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1460
fa6ac760
LP
1461 /* If the memory mapping triggered a SIGBUS then we return an
1462 * IO error and ignore the error code passed down to us, since
1463 * it is very likely just an effect of a nullified replacement
1464 * mapping page */
1465
1466 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1467 r = -EIO;
1468
50f20cfd
LP
1469 journal_file_post_change(f);
1470
cec736d2
LP
1471 return r;
1472}
1473
/* One cached position within a chain of entry arrays, keyed by the offset of
 * the chain's first array. */
typedef struct ChainCacheItem {
        /* offset of the array that starts the chain (the cache key) */
        uint64_t first;
        /* offset of the array this item remembers */
        uint64_t array;
        /* first item stored in the remembered array */
        uint64_t begin;
        /* number of items in all arrays of the chain before the remembered one */
        uint64_t total;
        /* last index looked at, used to improve locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1481
1482static void chain_cache_put(
4743015d 1483 OrderedHashmap *h,
a4bcff5b
LP
1484 ChainCacheItem *ci,
1485 uint64_t first,
1486 uint64_t array,
1487 uint64_t begin,
f268980d
LP
1488 uint64_t total,
1489 uint64_t last_index) {
a4bcff5b
LP
1490
1491 if (!ci) {
34741aa3
LP
1492 /* If the chain item to cache for this chain is the
1493 * first one it's not worth caching anything */
1494 if (array == first)
1495 return;
1496
29433089 1497 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 1498 ci = ordered_hashmap_steal_first(h);
29433089
LP
1499 assert(ci);
1500 } else {
a4bcff5b
LP
1501 ci = new(ChainCacheItem, 1);
1502 if (!ci)
1503 return;
1504 }
1505
1506 ci->first = first;
1507
4743015d 1508 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
1509 free(ci);
1510 return;
1511 }
1512 } else
1513 assert(ci->first == first);
1514
1515 ci->array = array;
1516 ci->begin = begin;
1517 ci->total = total;
f268980d 1518 ci->last_index = last_index;
a4bcff5b
LP
1519}
1520
f268980d
LP
1521static int generic_array_get(
1522 JournalFile *f,
1523 uint64_t first,
1524 uint64_t i,
1525 Object **ret, uint64_t *offset) {
de190aef 1526
cec736d2 1527 Object *o;
a4bcff5b 1528 uint64_t p = 0, a, t = 0;
cec736d2 1529 int r;
a4bcff5b 1530 ChainCacheItem *ci;
cec736d2
LP
1531
1532 assert(f);
1533
de190aef 1534 a = first;
a4bcff5b
LP
1535
1536 /* Try the chain cache first */
4743015d 1537 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1538 if (ci && i > ci->total) {
1539 a = ci->array;
1540 i -= ci->total;
1541 t = ci->total;
1542 }
1543
de190aef 1544 while (a > 0) {
a4bcff5b 1545 uint64_t k;
cec736d2 1546
de190aef
LP
1547 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1548 if (r < 0)
1549 return r;
cec736d2 1550
a4bcff5b
LP
1551 k = journal_file_entry_array_n_items(o);
1552 if (i < k) {
de190aef 1553 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1554 goto found;
cec736d2
LP
1555 }
1556
a4bcff5b
LP
1557 i -= k;
1558 t += k;
de190aef
LP
1559 a = le64toh(o->entry_array.next_entry_array_offset);
1560 }
1561
a4bcff5b
LP
1562 return 0;
1563
1564found:
1565 /* Let's cache this item for the next invocation */
af13a6b0 1566 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1567
1568 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1569 if (r < 0)
1570 return r;
1571
1572 if (ret)
1573 *ret = o;
1574
1575 if (offset)
1576 *offset = p;
1577
1578 return 1;
1579}
1580
f268980d
LP
1581static int generic_array_get_plus_one(
1582 JournalFile *f,
1583 uint64_t extra,
1584 uint64_t first,
1585 uint64_t i,
1586 Object **ret, uint64_t *offset) {
de190aef
LP
1587
1588 Object *o;
1589
1590 assert(f);
1591
1592 if (i == 0) {
1593 int r;
1594
1595 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1596 if (r < 0)
1597 return r;
1598
de190aef
LP
1599 if (ret)
1600 *ret = o;
cec736d2 1601
de190aef
LP
1602 if (offset)
1603 *offset = extra;
cec736d2 1604
de190aef 1605 return 1;
cec736d2
LP
1606 }
1607
de190aef
LP
1608 return generic_array_get(f, first, i-1, ret, offset);
1609}
cec736d2 1610
de190aef
LP
/* Results of the test_object callbacks used when bisecting entry arrays */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object sorts before the needle */
        TEST_RIGHT = 2  /* the tested object sorts after the needle */
};
cec736d2 1616
f268980d
LP
1617static int generic_array_bisect(
1618 JournalFile *f,
1619 uint64_t first,
1620 uint64_t n,
1621 uint64_t needle,
1622 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1623 direction_t direction,
1624 Object **ret,
1625 uint64_t *offset,
1626 uint64_t *idx) {
1627
1628 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1629 bool subtract_one = false;
1630 Object *o, *array = NULL;
1631 int r;
a4bcff5b 1632 ChainCacheItem *ci;
cec736d2 1633
de190aef
LP
1634 assert(f);
1635 assert(test_object);
cec736d2 1636
a4bcff5b 1637 /* Start with the first array in the chain */
de190aef 1638 a = first;
a4bcff5b 1639
4743015d 1640 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1641 if (ci && n > ci->total) {
1642 /* Ah, we have iterated this bisection array chain
1643 * previously! Let's see if we can skip ahead in the
1644 * chain, as far as the last time. But we can't jump
1645 * backwards in the chain, so let's check that
1646 * first. */
1647
1648 r = test_object(f, ci->begin, needle);
1649 if (r < 0)
1650 return r;
1651
1652 if (r == TEST_LEFT) {
f268980d 1653 /* OK, what we are looking for is right of the
a4bcff5b
LP
1654 * begin of this EntryArray, so let's jump
1655 * straight to previously cached array in the
1656 * chain */
1657
1658 a = ci->array;
1659 n -= ci->total;
1660 t = ci->total;
f268980d 1661 last_index = ci->last_index;
a4bcff5b
LP
1662 }
1663 }
1664
de190aef
LP
1665 while (a > 0) {
1666 uint64_t left, right, k, lp;
1667
1668 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1669 if (r < 0)
1670 return r;
1671
de190aef
LP
1672 k = journal_file_entry_array_n_items(array);
1673 right = MIN(k, n);
1674 if (right <= 0)
1675 return 0;
cec736d2 1676
de190aef
LP
1677 i = right - 1;
1678 lp = p = le64toh(array->entry_array.items[i]);
1679 if (p <= 0)
1680 return -EBADMSG;
cec736d2 1681
de190aef
LP
1682 r = test_object(f, p, needle);
1683 if (r < 0)
1684 return r;
cec736d2 1685
de190aef
LP
1686 if (r == TEST_FOUND)
1687 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1688
1689 if (r == TEST_RIGHT) {
1690 left = 0;
1691 right -= 1;
f268980d
LP
1692
1693 if (last_index != (uint64_t) -1) {
1694 assert(last_index <= right);
1695
1696 /* If we cached the last index we
1697 * looked at, let's try to not to jump
1698 * too wildly around and see if we can
1699 * limit the range to look at early to
1700 * the immediate neighbors of the last
1701 * index we looked at. */
1702
1703 if (last_index > 0) {
1704 uint64_t x = last_index - 1;
1705
1706 p = le64toh(array->entry_array.items[x]);
1707 if (p <= 0)
1708 return -EBADMSG;
1709
1710 r = test_object(f, p, needle);
1711 if (r < 0)
1712 return r;
1713
1714 if (r == TEST_FOUND)
1715 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1716
1717 if (r == TEST_RIGHT)
1718 right = x;
1719 else
1720 left = x + 1;
1721 }
1722
1723 if (last_index < right) {
1724 uint64_t y = last_index + 1;
1725
1726 p = le64toh(array->entry_array.items[y]);
1727 if (p <= 0)
1728 return -EBADMSG;
1729
1730 r = test_object(f, p, needle);
1731 if (r < 0)
1732 return r;
1733
1734 if (r == TEST_FOUND)
1735 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1736
1737 if (r == TEST_RIGHT)
1738 right = y;
1739 else
1740 left = y + 1;
1741 }
f268980d
LP
1742 }
1743
de190aef
LP
1744 for (;;) {
1745 if (left == right) {
1746 if (direction == DIRECTION_UP)
1747 subtract_one = true;
1748
1749 i = left;
1750 goto found;
1751 }
1752
1753 assert(left < right);
de190aef 1754 i = (left + right) / 2;
f268980d 1755
de190aef
LP
1756 p = le64toh(array->entry_array.items[i]);
1757 if (p <= 0)
1758 return -EBADMSG;
1759
1760 r = test_object(f, p, needle);
1761 if (r < 0)
1762 return r;
cec736d2 1763
de190aef
LP
1764 if (r == TEST_FOUND)
1765 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1766
1767 if (r == TEST_RIGHT)
1768 right = i;
1769 else
1770 left = i + 1;
1771 }
1772 }
1773
2173cbf8 1774 if (k >= n) {
cbdca852
LP
1775 if (direction == DIRECTION_UP) {
1776 i = n;
1777 subtract_one = true;
1778 goto found;
1779 }
1780
cec736d2 1781 return 0;
cbdca852 1782 }
cec736d2 1783
de190aef
LP
1784 last_p = lp;
1785
1786 n -= k;
1787 t += k;
f268980d 1788 last_index = (uint64_t) -1;
de190aef 1789 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1790 }
1791
1792 return 0;
de190aef
LP
1793
1794found:
1795 if (subtract_one && t == 0 && i == 0)
1796 return 0;
1797
a4bcff5b 1798 /* Let's cache this item for the next invocation */
af13a6b0 1799 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 1800
de190aef
LP
1801 if (subtract_one && i == 0)
1802 p = last_p;
1803 else if (subtract_one)
1804 p = le64toh(array->entry_array.items[i-1]);
1805 else
1806 p = le64toh(array->entry_array.items[i]);
1807
1808 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1809 if (r < 0)
1810 return r;
1811
1812 if (ret)
1813 *ret = o;
1814
1815 if (offset)
1816 *offset = p;
1817
1818 if (idx)
cbdca852 1819 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1820
1821 return 1;
cec736d2
LP
1822}
1823
f268980d
LP
1824static int generic_array_bisect_plus_one(
1825 JournalFile *f,
1826 uint64_t extra,
1827 uint64_t first,
1828 uint64_t n,
1829 uint64_t needle,
1830 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1831 direction_t direction,
1832 Object **ret,
1833 uint64_t *offset,
1834 uint64_t *idx) {
de190aef 1835
cec736d2 1836 int r;
cbdca852
LP
1837 bool step_back = false;
1838 Object *o;
cec736d2
LP
1839
1840 assert(f);
de190aef 1841 assert(test_object);
cec736d2 1842
de190aef
LP
1843 if (n <= 0)
1844 return 0;
cec736d2 1845
de190aef
LP
1846 /* This bisects the array in object 'first', but first checks
1847 * an extra */
de190aef
LP
1848 r = test_object(f, extra, needle);
1849 if (r < 0)
1850 return r;
a536e261
LP
1851
1852 if (r == TEST_FOUND)
1853 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1854
cbdca852
LP
1855 /* if we are looking with DIRECTION_UP then we need to first
1856 see if in the actual array there is a matching entry, and
1857 return the last one of that. But if there isn't any we need
1858 to return this one. Hence remember this, and return it
1859 below. */
1860 if (r == TEST_LEFT)
1861 step_back = direction == DIRECTION_UP;
de190aef 1862
cbdca852
LP
1863 if (r == TEST_RIGHT) {
1864 if (direction == DIRECTION_DOWN)
1865 goto found;
1866 else
1867 return 0;
a536e261 1868 }
cec736d2 1869
de190aef
LP
1870 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1871
cbdca852
LP
1872 if (r == 0 && step_back)
1873 goto found;
1874
ecf68b1d 1875 if (r > 0 && idx)
de190aef
LP
1876 (*idx) ++;
1877
1878 return r;
cbdca852
LP
1879
1880found:
1881 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1882 if (r < 0)
1883 return r;
1884
1885 if (ret)
1886 *ret = o;
1887
1888 if (offset)
1889 *offset = extra;
1890
1891 if (idx)
1892 *idx = 0;
1893
1894 return 1;
1895}
1896
44a6b1b6 1897_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1898 assert(f);
1899 assert(p > 0);
1900
1901 if (p == needle)
1902 return TEST_FOUND;
1903 else if (p < needle)
1904 return TEST_LEFT;
1905 else
1906 return TEST_RIGHT;
1907}
1908
de190aef
LP
1909static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1910 Object *o;
1911 int r;
1912
1913 assert(f);
1914 assert(p > 0);
1915
1916 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1917 if (r < 0)
1918 return r;
1919
de190aef
LP
1920 if (le64toh(o->entry.seqnum) == needle)
1921 return TEST_FOUND;
1922 else if (le64toh(o->entry.seqnum) < needle)
1923 return TEST_LEFT;
1924 else
1925 return TEST_RIGHT;
1926}
cec736d2 1927
de190aef
LP
1928int journal_file_move_to_entry_by_seqnum(
1929 JournalFile *f,
1930 uint64_t seqnum,
1931 direction_t direction,
1932 Object **ret,
1933 uint64_t *offset) {
1934
1935 return generic_array_bisect(f,
1936 le64toh(f->header->entry_array_offset),
1937 le64toh(f->header->n_entries),
1938 seqnum,
1939 test_object_seqnum,
1940 direction,
1941 ret, offset, NULL);
1942}
cec736d2 1943
de190aef
LP
1944static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1945 Object *o;
1946 int r;
1947
1948 assert(f);
1949 assert(p > 0);
1950
1951 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1952 if (r < 0)
1953 return r;
1954
1955 if (le64toh(o->entry.realtime) == needle)
1956 return TEST_FOUND;
1957 else if (le64toh(o->entry.realtime) < needle)
1958 return TEST_LEFT;
1959 else
1960 return TEST_RIGHT;
cec736d2
LP
1961}
1962
de190aef
LP
1963int journal_file_move_to_entry_by_realtime(
1964 JournalFile *f,
1965 uint64_t realtime,
1966 direction_t direction,
1967 Object **ret,
1968 uint64_t *offset) {
1969
1970 return generic_array_bisect(f,
1971 le64toh(f->header->entry_array_offset),
1972 le64toh(f->header->n_entries),
1973 realtime,
1974 test_object_realtime,
1975 direction,
1976 ret, offset, NULL);
1977}
1978
1979static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1980 Object *o;
1981 int r;
1982
1983 assert(f);
1984 assert(p > 0);
1985
1986 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1987 if (r < 0)
1988 return r;
1989
1990 if (le64toh(o->entry.monotonic) == needle)
1991 return TEST_FOUND;
1992 else if (le64toh(o->entry.monotonic) < needle)
1993 return TEST_LEFT;
1994 else
1995 return TEST_RIGHT;
1996}
1997
2a560338 1998static int find_data_object_by_boot_id(
47838ab3
ZJS
1999 JournalFile *f,
2000 sd_id128_t boot_id,
2001 Object **o,
2002 uint64_t *b) {
2a560338 2003
47838ab3
ZJS
2004 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2005
2006 sd_id128_to_string(boot_id, t + 9);
2007 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2008}
2009
de190aef
LP
2010int journal_file_move_to_entry_by_monotonic(
2011 JournalFile *f,
2012 sd_id128_t boot_id,
2013 uint64_t monotonic,
2014 direction_t direction,
2015 Object **ret,
2016 uint64_t *offset) {
2017
de190aef
LP
2018 Object *o;
2019 int r;
2020
cbdca852 2021 assert(f);
de190aef 2022
47838ab3 2023 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
2024 if (r < 0)
2025 return r;
cbdca852 2026 if (r == 0)
de190aef
LP
2027 return -ENOENT;
2028
2029 return generic_array_bisect_plus_one(f,
2030 le64toh(o->data.entry_offset),
2031 le64toh(o->data.entry_array_offset),
2032 le64toh(o->data.n_entries),
2033 monotonic,
2034 test_object_monotonic,
2035 direction,
2036 ret, offset, NULL);
2037}
2038
1fc605b0 2039void journal_file_reset_location(JournalFile *f) {
6573ef05 2040 f->location_type = LOCATION_HEAD;
1fc605b0 2041 f->current_offset = 0;
6573ef05
MS
2042 f->current_seqnum = 0;
2043 f->current_realtime = 0;
2044 f->current_monotonic = 0;
2045 zero(f->current_boot_id);
2046 f->current_xor_hash = 0;
2047}
2048
950c07d4 2049void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2050 f->location_type = LOCATION_SEEK;
2051 f->current_offset = offset;
2052 f->current_seqnum = le64toh(o->entry.seqnum);
2053 f->current_realtime = le64toh(o->entry.realtime);
2054 f->current_monotonic = le64toh(o->entry.monotonic);
2055 f->current_boot_id = o->entry.boot_id;
2056 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2057}
2058
d8ae66d7
MS
2059int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2060 assert(af);
2061 assert(bf);
2062 assert(af->location_type == LOCATION_SEEK);
2063 assert(bf->location_type == LOCATION_SEEK);
2064
2065 /* If contents and timestamps match, these entries are
2066 * identical, even if the seqnum does not match */
2067 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2068 af->current_monotonic == bf->current_monotonic &&
2069 af->current_realtime == bf->current_realtime &&
2070 af->current_xor_hash == bf->current_xor_hash)
2071 return 0;
2072
2073 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2074
2075 /* If this is from the same seqnum source, compare
2076 * seqnums */
2077 if (af->current_seqnum < bf->current_seqnum)
2078 return -1;
2079 if (af->current_seqnum > bf->current_seqnum)
2080 return 1;
2081
2082 /* Wow! This is weird, different data but the same
2083 * seqnums? Something is borked, but let's make the
2084 * best of it and compare by time. */
2085 }
2086
2087 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2088
2089 /* If the boot id matches, compare monotonic time */
2090 if (af->current_monotonic < bf->current_monotonic)
2091 return -1;
2092 if (af->current_monotonic > bf->current_monotonic)
2093 return 1;
2094 }
2095
2096 /* Otherwise, compare UTC time */
2097 if (af->current_realtime < bf->current_realtime)
2098 return -1;
2099 if (af->current_realtime > bf->current_realtime)
2100 return 1;
2101
2102 /* Finally, compare by contents */
2103 if (af->current_xor_hash < bf->current_xor_hash)
2104 return -1;
2105 if (af->current_xor_hash > bf->current_xor_hash)
2106 return 1;
2107
2108 return 0;
2109}
2110
de190aef
LP
2111int journal_file_next_entry(
2112 JournalFile *f,
f534928a 2113 uint64_t p,
de190aef
LP
2114 direction_t direction,
2115 Object **ret, uint64_t *offset) {
2116
fb099c8d 2117 uint64_t i, n, ofs;
cec736d2
LP
2118 int r;
2119
2120 assert(f);
de190aef
LP
2121
2122 n = le64toh(f->header->n_entries);
2123 if (n <= 0)
2124 return 0;
cec736d2 2125
f534928a 2126 if (p == 0)
de190aef 2127 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2128 else {
de190aef
LP
2129 r = generic_array_bisect(f,
2130 le64toh(f->header->entry_array_offset),
2131 le64toh(f->header->n_entries),
2132 p,
2133 test_object_offset,
2134 DIRECTION_DOWN,
2135 NULL, NULL,
2136 &i);
2137 if (r <= 0)
2138 return r;
2139
2140 if (direction == DIRECTION_DOWN) {
2141 if (i >= n - 1)
2142 return 0;
2143
2144 i++;
2145 } else {
2146 if (i <= 0)
2147 return 0;
2148
2149 i--;
2150 }
cec736d2
LP
2151 }
2152
de190aef 2153 /* And jump to it */
fb099c8d
ZJS
2154 r = generic_array_get(f,
2155 le64toh(f->header->entry_array_offset),
2156 i,
2157 ret, &ofs);
2158 if (r <= 0)
2159 return r;
2160
2161 if (p > 0 &&
2162 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2163 log_debug("%s: entry array corrupted at entry %"PRIu64,
2164 f->path, i);
2165 return -EBADMSG;
2166 }
2167
2168 if (offset)
2169 *offset = ofs;
2170
2171 return 1;
de190aef 2172}
cec736d2 2173
de190aef
LP
2174int journal_file_next_entry_for_data(
2175 JournalFile *f,
2176 Object *o, uint64_t p,
2177 uint64_t data_offset,
2178 direction_t direction,
2179 Object **ret, uint64_t *offset) {
2180
2181 uint64_t n, i;
cec736d2 2182 int r;
de190aef 2183 Object *d;
cec736d2
LP
2184
2185 assert(f);
de190aef 2186 assert(p > 0 || !o);
cec736d2 2187
de190aef 2188 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2189 if (r < 0)
de190aef 2190 return r;
cec736d2 2191
de190aef
LP
2192 n = le64toh(d->data.n_entries);
2193 if (n <= 0)
2194 return n;
cec736d2 2195
de190aef
LP
2196 if (!o)
2197 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2198 else {
2199 if (o->object.type != OBJECT_ENTRY)
2200 return -EINVAL;
cec736d2 2201
de190aef
LP
2202 r = generic_array_bisect_plus_one(f,
2203 le64toh(d->data.entry_offset),
2204 le64toh(d->data.entry_array_offset),
2205 le64toh(d->data.n_entries),
2206 p,
2207 test_object_offset,
2208 DIRECTION_DOWN,
2209 NULL, NULL,
2210 &i);
2211
2212 if (r <= 0)
cec736d2
LP
2213 return r;
2214
de190aef
LP
2215 if (direction == DIRECTION_DOWN) {
2216 if (i >= n - 1)
2217 return 0;
cec736d2 2218
de190aef
LP
2219 i++;
2220 } else {
2221 if (i <= 0)
2222 return 0;
cec736d2 2223
de190aef
LP
2224 i--;
2225 }
cec736d2 2226
de190aef 2227 }
cec736d2 2228
de190aef
LP
2229 return generic_array_get_plus_one(f,
2230 le64toh(d->data.entry_offset),
2231 le64toh(d->data.entry_array_offset),
2232 i,
2233 ret, offset);
2234}
cec736d2 2235
cbdca852
LP
2236int journal_file_move_to_entry_by_offset_for_data(
2237 JournalFile *f,
2238 uint64_t data_offset,
2239 uint64_t p,
2240 direction_t direction,
2241 Object **ret, uint64_t *offset) {
2242
2243 int r;
2244 Object *d;
2245
2246 assert(f);
2247
2248 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2249 if (r < 0)
2250 return r;
2251
2252 return generic_array_bisect_plus_one(f,
2253 le64toh(d->data.entry_offset),
2254 le64toh(d->data.entry_array_offset),
2255 le64toh(d->data.n_entries),
2256 p,
2257 test_object_offset,
2258 direction,
2259 ret, offset, NULL);
2260}
2261
2262int journal_file_move_to_entry_by_monotonic_for_data(
2263 JournalFile *f,
2264 uint64_t data_offset,
2265 sd_id128_t boot_id,
2266 uint64_t monotonic,
2267 direction_t direction,
2268 Object **ret, uint64_t *offset) {
2269
cbdca852
LP
2270 Object *o, *d;
2271 int r;
2272 uint64_t b, z;
2273
2274 assert(f);
2275
2276 /* First, seek by time */
47838ab3 2277 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2278 if (r < 0)
2279 return r;
2280 if (r == 0)
2281 return -ENOENT;
2282
2283 r = generic_array_bisect_plus_one(f,
2284 le64toh(o->data.entry_offset),
2285 le64toh(o->data.entry_array_offset),
2286 le64toh(o->data.n_entries),
2287 monotonic,
2288 test_object_monotonic,
2289 direction,
2290 NULL, &z, NULL);
2291 if (r <= 0)
2292 return r;
2293
2294 /* And now, continue seeking until we find an entry that
2295 * exists in both bisection arrays */
2296
2297 for (;;) {
2298 Object *qo;
2299 uint64_t p, q;
2300
2301 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2302 if (r < 0)
2303 return r;
2304
2305 r = generic_array_bisect_plus_one(f,
2306 le64toh(d->data.entry_offset),
2307 le64toh(d->data.entry_array_offset),
2308 le64toh(d->data.n_entries),
2309 z,
2310 test_object_offset,
2311 direction,
2312 NULL, &p, NULL);
2313 if (r <= 0)
2314 return r;
2315
2316 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2317 if (r < 0)
2318 return r;
2319
2320 r = generic_array_bisect_plus_one(f,
2321 le64toh(o->data.entry_offset),
2322 le64toh(o->data.entry_array_offset),
2323 le64toh(o->data.n_entries),
2324 p,
2325 test_object_offset,
2326 direction,
2327 &qo, &q, NULL);
2328
2329 if (r <= 0)
2330 return r;
2331
2332 if (p == q) {
2333 if (ret)
2334 *ret = qo;
2335 if (offset)
2336 *offset = q;
2337
2338 return 1;
2339 }
2340
2341 z = q;
2342 }
cbdca852
LP
2343}
2344
de190aef
LP
2345int journal_file_move_to_entry_by_seqnum_for_data(
2346 JournalFile *f,
2347 uint64_t data_offset,
2348 uint64_t seqnum,
2349 direction_t direction,
2350 Object **ret, uint64_t *offset) {
cec736d2 2351
de190aef
LP
2352 Object *d;
2353 int r;
cec736d2 2354
91a31dde
LP
2355 assert(f);
2356
de190aef 2357 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2358 if (r < 0)
de190aef 2359 return r;
cec736d2 2360
de190aef
LP
2361 return generic_array_bisect_plus_one(f,
2362 le64toh(d->data.entry_offset),
2363 le64toh(d->data.entry_array_offset),
2364 le64toh(d->data.n_entries),
2365 seqnum,
2366 test_object_seqnum,
2367 direction,
2368 ret, offset, NULL);
2369}
cec736d2 2370
de190aef
LP
2371int journal_file_move_to_entry_by_realtime_for_data(
2372 JournalFile *f,
2373 uint64_t data_offset,
2374 uint64_t realtime,
2375 direction_t direction,
2376 Object **ret, uint64_t *offset) {
2377
2378 Object *d;
2379 int r;
2380
91a31dde
LP
2381 assert(f);
2382
de190aef 2383 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2384 if (r < 0)
de190aef
LP
2385 return r;
2386
2387 return generic_array_bisect_plus_one(f,
2388 le64toh(d->data.entry_offset),
2389 le64toh(d->data.entry_array_offset),
2390 le64toh(d->data.n_entries),
2391 realtime,
2392 test_object_realtime,
2393 direction,
2394 ret, offset, NULL);
cec736d2
LP
2395}
2396
0284adc6 2397void journal_file_dump(JournalFile *f) {
7560fffc 2398 Object *o;
7560fffc 2399 int r;
0284adc6 2400 uint64_t p;
7560fffc
LP
2401
2402 assert(f);
2403
0284adc6 2404 journal_file_print_header(f);
7560fffc 2405
0284adc6
LP
2406 p = le64toh(f->header->header_size);
2407 while (p != 0) {
d05089d8 2408 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
2409 if (r < 0)
2410 goto fail;
7560fffc 2411
0284adc6 2412 switch (o->object.type) {
d98cc1f2 2413
0284adc6
LP
2414 case OBJECT_UNUSED:
2415 printf("Type: OBJECT_UNUSED\n");
2416 break;
d98cc1f2 2417
0284adc6
LP
2418 case OBJECT_DATA:
2419 printf("Type: OBJECT_DATA\n");
2420 break;
7560fffc 2421
3c1668da
LP
2422 case OBJECT_FIELD:
2423 printf("Type: OBJECT_FIELD\n");
2424 break;
2425
0284adc6 2426 case OBJECT_ENTRY:
507f22bd
ZJS
2427 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2428 le64toh(o->entry.seqnum),
2429 le64toh(o->entry.monotonic),
2430 le64toh(o->entry.realtime));
0284adc6 2431 break;
7560fffc 2432
0284adc6
LP
2433 case OBJECT_FIELD_HASH_TABLE:
2434 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2435 break;
7560fffc 2436
0284adc6
LP
2437 case OBJECT_DATA_HASH_TABLE:
2438 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2439 break;
7560fffc 2440
0284adc6
LP
2441 case OBJECT_ENTRY_ARRAY:
2442 printf("Type: OBJECT_ENTRY_ARRAY\n");
2443 break;
7560fffc 2444
0284adc6 2445 case OBJECT_TAG:
507f22bd
ZJS
2446 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2447 le64toh(o->tag.seqnum),
2448 le64toh(o->tag.epoch));
0284adc6 2449 break;
3c1668da
LP
2450
2451 default:
8facc349 2452 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 2453 break;
0284adc6 2454 }
7560fffc 2455
d89c8fdf
ZJS
2456 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2457 printf("Flags: %s\n",
2458 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 2459
0284adc6
LP
2460 if (p == le64toh(f->header->tail_object_offset))
2461 p = 0;
2462 else
2463 p = p + ALIGN64(le64toh(o->object.size));
2464 }
7560fffc 2465
0284adc6
LP
2466 return;
2467fail:
2468 log_error("File corrupt");
7560fffc
LP
2469}
2470
718fe4b1
ZJS
2471static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2472 const char *x;
2473
2474 x = format_timestamp(buf, l, t);
2475 if (x)
2476 return x;
2477 return " --- ";
2478}
2479
0284adc6 2480void journal_file_print_header(JournalFile *f) {
2765b7bb 2481 char a[33], b[33], c[33], d[33];
ed375beb 2482 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2483 struct stat st;
2484 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2485
2486 assert(f);
7560fffc 2487
0284adc6
LP
2488 printf("File Path: %s\n"
2489 "File ID: %s\n"
2490 "Machine ID: %s\n"
2491 "Boot ID: %s\n"
2492 "Sequential Number ID: %s\n"
2493 "State: %s\n"
2494 "Compatible Flags:%s%s\n"
d89c8fdf 2495 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
2496 "Header size: %"PRIu64"\n"
2497 "Arena size: %"PRIu64"\n"
2498 "Data Hash Table Size: %"PRIu64"\n"
2499 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2500 "Rotate Suggested: %s\n"
507f22bd
ZJS
2501 "Head Sequential Number: %"PRIu64"\n"
2502 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2503 "Head Realtime Timestamp: %s\n"
3223f44f 2504 "Tail Realtime Timestamp: %s\n"
ed375beb 2505 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2506 "Objects: %"PRIu64"\n"
2507 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2508 f->path,
2509 sd_id128_to_string(f->header->file_id, a),
2510 sd_id128_to_string(f->header->machine_id, b),
2511 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2512 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2513 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2514 f->header->state == STATE_ONLINE ? "ONLINE" :
2515 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 2516 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
2517 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2518 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2519 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2520 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
2521 le64toh(f->header->header_size),
2522 le64toh(f->header->arena_size),
2523 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2524 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2525 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2526 le64toh(f->header->head_entry_seqnum),
2527 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2528 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2529 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2530 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2531 le64toh(f->header->n_objects),
2532 le64toh(f->header->n_entries));
7560fffc 2533
0284adc6 2534 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2535 printf("Data Objects: %"PRIu64"\n"
0284adc6 2536 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2537 le64toh(f->header->n_data),
0284adc6 2538 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2539
0284adc6 2540 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2541 printf("Field Objects: %"PRIu64"\n"
0284adc6 2542 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2543 le64toh(f->header->n_fields),
0284adc6 2544 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2545
2546 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2547 printf("Tag Objects: %"PRIu64"\n",
2548 le64toh(f->header->n_tags));
3223f44f 2549 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2550 printf("Entry Array Objects: %"PRIu64"\n",
2551 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2552
2553 if (fstat(f->fd, &st) >= 0)
59f448cf 2554 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
7560fffc
LP
2555}
2556
fc68c929
LP
2557static int journal_file_warn_btrfs(JournalFile *f) {
2558 unsigned attrs;
2559 int r;
2560
2561 assert(f);
2562
2563 /* Before we write anything, check if the COW logic is turned
2564 * off on btrfs. Given our write pattern that is quite
2565 * unfriendly to COW file systems this should greatly improve
2566 * performance on COW file systems, such as btrfs, at the
2567 * expense of data integrity features (which shouldn't be too
2568 * bad, given that we do our own checksumming). */
2569
2570 r = btrfs_is_filesystem(f->fd);
2571 if (r < 0)
2572 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2573 if (!r)
2574 return 0;
2575
2576 r = read_attr_fd(f->fd, &attrs);
2577 if (r < 0)
2578 return log_warning_errno(r, "Failed to read file attributes: %m");
2579
2580 if (attrs & FS_NOCOW_FL) {
2581 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2582 return 0;
2583 }
2584
2585 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2586 "This is likely to slow down journal access substantially, please consider turning "
2587 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2588
2589 return 1;
2590}
2591
0284adc6
LP
2592int journal_file_open(
2593 const char *fname,
2594 int flags,
2595 mode_t mode,
2596 bool compress,
baed47c3 2597 bool seal,
0284adc6
LP
2598 JournalMetrics *metrics,
2599 MMapCache *mmap_cache,
2600 JournalFile *template,
2601 JournalFile **ret) {
7560fffc 2602
fa6ac760 2603 bool newly_created = false;
0284adc6 2604 JournalFile *f;
fa6ac760 2605 void *h;
0284adc6 2606 int r;
7560fffc 2607
0284adc6 2608 assert(fname);
0559d3a5 2609 assert(ret);
7560fffc 2610
0284adc6
LP
2611 if ((flags & O_ACCMODE) != O_RDONLY &&
2612 (flags & O_ACCMODE) != O_RDWR)
2613 return -EINVAL;
7560fffc 2614
a0108012
LP
2615 if (!endswith(fname, ".journal") &&
2616 !endswith(fname, ".journal~"))
0284adc6 2617 return -EINVAL;
7560fffc 2618
0284adc6
LP
2619 f = new0(JournalFile, 1);
2620 if (!f)
2621 return -ENOMEM;
7560fffc 2622
0284adc6
LP
2623 f->fd = -1;
2624 f->mode = mode;
7560fffc 2625
0284adc6
LP
2626 f->flags = flags;
2627 f->prot = prot_from_flags(flags);
2628 f->writable = (flags & O_ACCMODE) != O_RDONLY;
92261977 2629#if defined(HAVE_LZ4)
d89c8fdf
ZJS
2630 f->compress_lz4 = compress;
2631#elif defined(HAVE_XZ)
2632 f->compress_xz = compress;
48b61739 2633#endif
49a32d43 2634#ifdef HAVE_GCRYPT
baed47c3 2635 f->seal = seal;
49a32d43 2636#endif
7560fffc 2637
0284adc6
LP
2638 if (mmap_cache)
2639 f->mmap = mmap_cache_ref(mmap_cache);
2640 else {
84168d80 2641 f->mmap = mmap_cache_new();
0284adc6
LP
2642 if (!f->mmap) {
2643 r = -ENOMEM;
2644 goto fail;
2645 }
2646 }
7560fffc 2647
0284adc6
LP
2648 f->path = strdup(fname);
2649 if (!f->path) {
2650 r = -ENOMEM;
2651 goto fail;
2652 }
7560fffc 2653
4743015d 2654 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
2655 if (!f->chain_cache) {
2656 r = -ENOMEM;
2657 goto fail;
2658 }
2659
0284adc6
LP
2660 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2661 if (f->fd < 0) {
2662 r = -errno;
2663 goto fail;
7560fffc 2664 }
7560fffc 2665
2678031a
LP
2666 r = journal_file_fstat(f);
2667 if (r < 0)
0284adc6 2668 goto fail;
7560fffc 2669
0284adc6 2670 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a 2671
fc68c929 2672 (void) journal_file_warn_btrfs(f);
11689d2a 2673
fb0951b0
LP
2674 /* Let's attach the creation time to the journal file,
2675 * so that the vacuuming code knows the age of this
2676 * file even if the file might end up corrupted one
2677 * day... Ideally we'd just use the creation time many
2678 * file systems maintain for each file, but there is
2679 * currently no usable API to query this, hence let's
2680 * emulate this via extended attributes. If extended
2681 * attributes are not supported we'll just skip this,
7517e174 2682 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0 2683
d61b600d 2684 fd_setcrtime(f->fd, 0);
7560fffc 2685
feb12d3e 2686#ifdef HAVE_GCRYPT
0284adc6 2687 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2688 * just don't do sealing */
49a32d43
LP
2689 if (f->seal) {
2690 r = journal_file_fss_load(f);
2691 if (r < 0)
2692 f->seal = false;
2693 }
feb12d3e 2694#endif
7560fffc 2695
0284adc6
LP
2696 r = journal_file_init_header(f, template);
2697 if (r < 0)
2698 goto fail;
7560fffc 2699
2678031a
LP
2700 r = journal_file_fstat(f);
2701 if (r < 0)
0284adc6 2702 goto fail;
fb0951b0
LP
2703
2704 newly_created = true;
0284adc6 2705 }
7560fffc 2706
0284adc6 2707 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
cfb571f3 2708 r = -ENODATA;
0284adc6
LP
2709 goto fail;
2710 }
7560fffc 2711
fa6ac760 2712 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
977eaa1e 2713 if (r < 0)
0284adc6 2714 goto fail;
7560fffc 2715
fa6ac760
LP
2716 f->header = h;
2717
0284adc6
LP
2718 if (!newly_created) {
2719 r = journal_file_verify_header(f);
2720 if (r < 0)
2721 goto fail;
2722 }
7560fffc 2723
feb12d3e 2724#ifdef HAVE_GCRYPT
0284adc6 2725 if (!newly_created && f->writable) {
baed47c3 2726 r = journal_file_fss_load(f);
0284adc6
LP
2727 if (r < 0)
2728 goto fail;
2729 }
feb12d3e 2730#endif
cec736d2
LP
2731
2732 if (f->writable) {
4a92baf3
LP
2733 if (metrics) {
2734 journal_default_metrics(metrics, f->fd);
2735 f->metrics = *metrics;
2736 } else if (template)
2737 f->metrics = template->metrics;
2738
cec736d2
LP
2739 r = journal_file_refresh_header(f);
2740 if (r < 0)
2741 goto fail;
2742 }
2743
feb12d3e 2744#ifdef HAVE_GCRYPT
baed47c3 2745 r = journal_file_hmac_setup(f);
14d10188
LP
2746 if (r < 0)
2747 goto fail;
feb12d3e 2748#endif
14d10188 2749
cec736d2 2750 if (newly_created) {
de190aef 2751 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2752 if (r < 0)
2753 goto fail;
2754
de190aef 2755 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2756 if (r < 0)
2757 goto fail;
7560fffc 2758
feb12d3e 2759#ifdef HAVE_GCRYPT
7560fffc
LP
2760 r = journal_file_append_first_tag(f);
2761 if (r < 0)
2762 goto fail;
feb12d3e 2763#endif
cec736d2
LP
2764 }
2765
fa6ac760
LP
2766 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2767 r = -EIO;
2768 goto fail;
2769 }
2770
0559d3a5 2771 *ret = f;
cec736d2
LP
2772 return 0;
2773
2774fail:
fa6ac760
LP
2775 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2776 r = -EIO;
2777
cec736d2
LP
2778 journal_file_close(f);
2779
2780 return r;
2781}
0ac38b70 2782
baed47c3 2783int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2784 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2785 size_t l;
2786 JournalFile *old_file, *new_file = NULL;
2787 int r;
2788
2789 assert(f);
2790 assert(*f);
2791
2792 old_file = *f;
2793
2794 if (!old_file->writable)
2795 return -EINVAL;
2796
2797 if (!endswith(old_file->path, ".journal"))
2798 return -EINVAL;
2799
2800 l = strlen(old_file->path);
57535f47
ZJS
2801 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2802 (int) l - 8, old_file->path,
2803 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2804 le64toh((*f)->header->head_entry_seqnum),
2805 le64toh((*f)->header->head_entry_realtime));
2806 if (r < 0)
0ac38b70
LP
2807 return -ENOMEM;
2808
2678031a
LP
2809 /* Try to rename the file to the archived version. If the file
2810 * already was deleted, we'll get ENOENT, let's ignore that
2811 * case. */
0ac38b70 2812 r = rename(old_file->path, p);
2678031a 2813 if (r < 0 && errno != ENOENT)
0ac38b70
LP
2814 return -errno;
2815
ccdbaf91 2816 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2817
f27a3864
LP
2818 /* Currently, btrfs is not very good with out write patterns
2819 * and fragments heavily. Let's defrag our journal files when
2820 * we archive them */
2821 old_file->defrag_on_close = true;
2822
baed47c3 2823 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2824 journal_file_close(old_file);
2825
2826 *f = new_file;
2827 return r;
2828}
2829
9447a7f1
LP
2830int journal_file_open_reliably(
2831 const char *fname,
2832 int flags,
2833 mode_t mode,
7560fffc 2834 bool compress,
baed47c3 2835 bool seal,
4a92baf3 2836 JournalMetrics *metrics,
27370278 2837 MMapCache *mmap_cache,
9447a7f1
LP
2838 JournalFile *template,
2839 JournalFile **ret) {
2840
2841 int r;
2842 size_t l;
ed375beb 2843 _cleanup_free_ char *p = NULL;
9447a7f1 2844
070052ab 2845 r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
288359db
ZJS
2846 if (!IN_SET(r,
2847 -EBADMSG, /* corrupted */
2848 -ENODATA, /* truncated */
2849 -EHOSTDOWN, /* other machine */
2850 -EPROTONOSUPPORT, /* incompatible feature */
2851 -EBUSY, /* unclean shutdown */
2852 -ESHUTDOWN, /* already archived */
2853 -EIO, /* IO error, including SIGBUS on mmap */
2854 -EIDRM /* File has been deleted */))
9447a7f1
LP
2855 return r;
2856
2857 if ((flags & O_ACCMODE) == O_RDONLY)
2858 return r;
2859
2860 if (!(flags & O_CREAT))
2861 return r;
2862
7560fffc
LP
2863 if (!endswith(fname, ".journal"))
2864 return r;
2865
5c70eab4
LP
2866 /* The file is corrupted. Rotate it away and try it again (but only once) */
2867
9447a7f1 2868 l = strlen(fname);
d587eca5 2869 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
57535f47 2870 (int) l - 8, fname,
d587eca5 2871 now(CLOCK_REALTIME),
9bf3b535 2872 random_u64()) < 0)
9447a7f1
LP
2873 return -ENOMEM;
2874
65089b82 2875 if (rename(fname, p) < 0)
9447a7f1
LP
2876 return -errno;
2877
f27a3864
LP
2878 /* btrfs doesn't cope well with our write pattern and
2879 * fragments heavily. Let's defrag all files we rotate */
11689d2a
LP
2880
2881 (void) chattr_path(p, false, FS_NOCOW_FL);
f27a3864
LP
2882 (void) btrfs_defrag(p);
2883
65089b82 2884 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2885
070052ab 2886 return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
9447a7f1
LP
2887}
2888
cf244689
LP
2889int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2890 uint64_t i, n;
2891 uint64_t q, xor_hash = 0;
2892 int r;
2893 EntryItem *items;
2894 dual_timestamp ts;
2895
2896 assert(from);
2897 assert(to);
2898 assert(o);
2899 assert(p);
2900
2901 if (!to->writable)
2902 return -EPERM;
2903
2904 ts.monotonic = le64toh(o->entry.monotonic);
2905 ts.realtime = le64toh(o->entry.realtime);
2906
cf244689 2907 n = journal_file_entry_n_items(o);
4faa7004
TA
2908 /* alloca() can't take 0, hence let's allocate at least one */
2909 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
2910
2911 for (i = 0; i < n; i++) {
4fd052ae
FC
2912 uint64_t l, h;
2913 le64_t le_hash;
cf244689
LP
2914 size_t t;
2915 void *data;
2916 Object *u;
2917
2918 q = le64toh(o->entry.items[i].object_offset);
2919 le_hash = o->entry.items[i].hash;
2920
2921 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2922 if (r < 0)
2923 return r;
2924
2925 if (le_hash != o->data.hash)
2926 return -EBADMSG;
2927
2928 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2929 t = (size_t) l;
2930
2931 /* We hit the limit on 32bit machines */
2932 if ((uint64_t) t != l)
2933 return -E2BIG;
2934
d89c8fdf 2935 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 2936#if defined(HAVE_XZ) || defined(HAVE_LZ4)
a7f7d1bd 2937 size_t rsize = 0;
cf244689 2938
d89c8fdf
ZJS
2939 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2940 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2941 if (r < 0)
2942 return r;
cf244689
LP
2943
2944 data = from->compress_buffer;
2945 l = rsize;
3b1a55e1
ZJS
2946#else
2947 return -EPROTONOSUPPORT;
2948#endif
cf244689
LP
2949 } else
2950 data = o->data.payload;
2951
2952 r = journal_file_append_data(to, data, l, &u, &h);
2953 if (r < 0)
2954 return r;
2955
2956 xor_hash ^= le64toh(u->data.hash);
2957 items[i].object_offset = htole64(h);
2958 items[i].hash = u->data.hash;
2959
2960 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2961 if (r < 0)
2962 return r;
2963 }
2964
fa6ac760
LP
2965 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2966
2967 if (mmap_cache_got_sigbus(to->mmap, to->fd))
2968 return -EIO;
2969
2970 return r;
cf244689 2971}
babfc091 2972
8580d1f7
LP
2973void journal_reset_metrics(JournalMetrics *m) {
2974 assert(m);
2975
2976 /* Set everything to "pick automatic values". */
2977
2978 *m = (JournalMetrics) {
2979 .min_use = (uint64_t) -1,
2980 .max_use = (uint64_t) -1,
2981 .min_size = (uint64_t) -1,
2982 .max_size = (uint64_t) -1,
2983 .keep_free = (uint64_t) -1,
2984 .n_max_files = (uint64_t) -1,
2985 };
2986}
2987
babfc091 2988void journal_default_metrics(JournalMetrics *m, int fd) {
8580d1f7 2989 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
babfc091 2990 struct statvfs ss;
8580d1f7 2991 uint64_t fs_size;
babfc091
LP
2992
2993 assert(m);
2994 assert(fd >= 0);
2995
2996 if (fstatvfs(fd, &ss) >= 0)
2997 fs_size = ss.f_frsize * ss.f_blocks;
8580d1f7
LP
2998 else {
2999 log_debug_errno(errno, "Failed to detremine disk size: %m");
3000 fs_size = 0;
3001 }
babfc091
LP
3002
3003 if (m->max_use == (uint64_t) -1) {
3004
3005 if (fs_size > 0) {
3006 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3007
3008 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3009 m->max_use = DEFAULT_MAX_USE_UPPER;
3010
3011 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3012 m->max_use = DEFAULT_MAX_USE_LOWER;
3013 } else
3014 m->max_use = DEFAULT_MAX_USE_LOWER;
3015 } else {
3016 m->max_use = PAGE_ALIGN(m->max_use);
3017
8580d1f7 3018 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
babfc091
LP
3019 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3020 }
3021
8580d1f7
LP
3022 if (m->min_use == (uint64_t) -1)
3023 m->min_use = DEFAULT_MIN_USE;
3024
3025 if (m->min_use > m->max_use)
3026 m->min_use = m->max_use;
3027
babfc091
LP
3028 if (m->max_size == (uint64_t) -1) {
3029 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3030
3031 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3032 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3033 } else
3034 m->max_size = PAGE_ALIGN(m->max_size);
3035
8580d1f7
LP
3036 if (m->max_size != 0) {
3037 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3038 m->max_size = JOURNAL_FILE_SIZE_MIN;
babfc091 3039
8580d1f7
LP
3040 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3041 m->max_use = m->max_size*2;
3042 }
babfc091
LP
3043
3044 if (m->min_size == (uint64_t) -1)
3045 m->min_size = JOURNAL_FILE_SIZE_MIN;
3046 else {
3047 m->min_size = PAGE_ALIGN(m->min_size);
3048
3049 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3050 m->min_size = JOURNAL_FILE_SIZE_MIN;
3051
8580d1f7 3052 if (m->max_size != 0 && m->min_size > m->max_size)
babfc091
LP
3053 m->max_size = m->min_size;
3054 }
3055
3056 if (m->keep_free == (uint64_t) -1) {
3057
3058 if (fs_size > 0) {
8621b110 3059 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
3060
3061 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3062 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3063
3064 } else
3065 m->keep_free = DEFAULT_KEEP_FREE;
3066 }
3067
8580d1f7
LP
3068 if (m->n_max_files == (uint64_t) -1)
3069 m->n_max_files = DEFAULT_N_MAX_FILES;
3070
3071 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3072 format_bytes(a, sizeof(a), m->min_use),
3073 format_bytes(b, sizeof(b), m->max_use),
3074 format_bytes(c, sizeof(c), m->max_size),
3075 format_bytes(d, sizeof(d), m->min_size),
3076 format_bytes(e, sizeof(e), m->keep_free),
3077 m->n_max_files);
babfc091 3078}
08984293
LP
3079
3080int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
3081 assert(f);
3082 assert(from || to);
3083
3084 if (from) {
162566a4
LP
3085 if (f->header->head_entry_realtime == 0)
3086 return -ENOENT;
08984293 3087
162566a4 3088 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
3089 }
3090
3091 if (to) {
162566a4
LP
3092 if (f->header->tail_entry_realtime == 0)
3093 return -ENOENT;
08984293 3094
162566a4 3095 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
3096 }
3097
3098 return 1;
3099}
3100
3101int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
3102 Object *o;
3103 uint64_t p;
3104 int r;
3105
3106 assert(f);
3107 assert(from || to);
3108
47838ab3 3109 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
3110 if (r <= 0)
3111 return r;
3112
3113 if (le64toh(o->data.n_entries) <= 0)
3114 return 0;
3115
3116 if (from) {
3117 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3118 if (r < 0)
3119 return r;
3120
3121 *from = le64toh(o->entry.monotonic);
3122 }
3123
3124 if (to) {
3125 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3126 if (r < 0)
3127 return r;
3128
3129 r = generic_array_get_plus_one(f,
3130 le64toh(o->data.entry_offset),
3131 le64toh(o->data.entry_array_offset),
3132 le64toh(o->data.n_entries)-1,
3133 &o, NULL);
3134 if (r <= 0)
3135 return r;
3136
3137 *to = le64toh(o->entry.monotonic);
3138 }
3139
3140 return 1;
3141}
dca6219e 3142
fb0951b0 3143bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
3144 assert(f);
3145
3146 /* If we gained new header fields we gained new features,
3147 * hence suggest a rotation */
361f9cbc
LP
3148 if (le64toh(f->header->header_size) < sizeof(Header)) {
3149 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3150 return true;
361f9cbc 3151 }
dca6219e
LP
3152
3153 /* Let's check if the hash tables grew over a certain fill
3154 * level (75%, borrowing this value from Java's hash table
3155 * implementation), and if so suggest a rotation. To calculate
3156 * the fill level we need the n_data field, which only exists
3157 * in newer versions. */
3158
3159 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3160 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3161 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3162 f->path,
3163 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3164 le64toh(f->header->n_data),
3165 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3166 (unsigned long long) f->last_stat.st_size,
3167 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3168 return true;
361f9cbc 3169 }
dca6219e
LP
3170
3171 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3172 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3173 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3174 f->path,
3175 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3176 le64toh(f->header->n_fields),
3177 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3178 return true;
361f9cbc 3179 }
dca6219e 3180
0598fd4a
LP
3181 /* Are the data objects properly indexed by field objects? */
3182 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3183 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3184 le64toh(f->header->n_data) > 0 &&
3185 le64toh(f->header->n_fields) == 0)
3186 return true;
3187
fb0951b0
LP
3188 if (max_file_usec > 0) {
3189 usec_t t, h;
3190
3191 h = le64toh(f->header->head_entry_realtime);
3192 t = now(CLOCK_REALTIME);
3193
3194 if (h > 0 && t > h + max_file_usec)
3195 return true;
3196 }
3197
dca6219e
LP
3198 return false;
3199}