]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
util-lib: split out fd-related operations into fd-util.[ch]
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
cec736d2 22#include <errno.h>
cec736d2 23#include <fcntl.h>
11689d2a 24#include <linux/fs.h>
07630cea
LP
25#include <stddef.h>
26#include <sys/mman.h>
27#include <sys/statvfs.h>
28#include <sys/uio.h>
29#include <unistd.h>
fb0951b0 30
f27a3864 31#include "btrfs-util.h"
07630cea 32#include "compress.h"
3ffd4af2 33#include "fd-util.h"
0284adc6 34#include "journal-authenticate.h"
07630cea 35#include "journal-def.h"
3ffd4af2 36#include "journal-file.h"
cec736d2 37#include "lookup3.h"
3df3e884 38#include "random-util.h"
07630cea 39#include "string-util.h"
cec736d2 40
4a92baf3
LP
41#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
42#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 43
be19b7df 44#define COMPRESSION_SIZE_THRESHOLD (512ULL)
807e17f0 45
babfc091 46/* This is the minimum journal file size */
253f59df 47#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
babfc091
LP
48
49/* These are the lower and upper bounds if we deduce the max_use value
50 * from the file system size */
51#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
52#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
53
8580d1f7
LP
54/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
55#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
56
babfc091 57/* This is the upper bound if we deduce max_size from max_use */
71100051 58#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
babfc091
LP
59
60/* This is the upper bound if we deduce the keep_free value from the
61 * file system size */
62#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
63
64/* This is the keep_free value when we can't determine the file system
65 * size */
66#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
67
8580d1f7
LP
68/* This is the default maximum number of journal files to keep around. */
69#define DEFAULT_N_MAX_FILES (100)
70
dca6219e
LP
71/* n_data was the first entry we added after the initial file format design */
72#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 73
a4bcff5b
LP
74/* How many entries to keep in the entry array chain cache at max */
75#define CHAIN_CACHE_MAX 20
76
a676e665
LP
77/* How much to increase the journal file size at once each time we allocate something new. */
78#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
79
2678031a
LP
80/* Reread fstat() of the file for detecting deletions at least this often */
81#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
82
fa6ac760
LP
83/* The mmap context to use for the header; we pick the value one above the last defined type */
84#define CONTEXT_HEADER _OBJECT_TYPE_MAX
85
9588bc32 86static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
87 assert(f);
88
89 if (!f->writable)
90 return -EPERM;
91
92 if (!(f->fd >= 0 && f->header))
93 return -EINVAL;
94
fa6ac760
LP
95 if (mmap_cache_got_sigbus(f->mmap, f->fd))
96 return -EIO;
97
26687bf8
OS
98 switch(f->header->state) {
99 case STATE_ONLINE:
100 return 0;
101
102 case STATE_OFFLINE:
103 f->header->state = STATE_ONLINE;
104 fsync(f->fd);
105 return 0;
106
107 default:
108 return -EINVAL;
109 }
110}
111
112int journal_file_set_offline(JournalFile *f) {
113 assert(f);
114
115 if (!f->writable)
116 return -EPERM;
117
118 if (!(f->fd >= 0 && f->header))
119 return -EINVAL;
120
121 if (f->header->state != STATE_ONLINE)
122 return 0;
123
124 fsync(f->fd);
125
fa6ac760
LP
126 if (mmap_cache_got_sigbus(f->mmap, f->fd))
127 return -EIO;
128
26687bf8
OS
129 f->header->state = STATE_OFFLINE;
130
fa6ac760
LP
131 if (mmap_cache_got_sigbus(f->mmap, f->fd))
132 return -EIO;
133
26687bf8
OS
134 fsync(f->fd);
135
136 return 0;
137}
138
804ae586 139JournalFile* journal_file_close(JournalFile *f) {
de190aef 140 assert(f);
cec736d2 141
feb12d3e 142#ifdef HAVE_GCRYPT
b0af6f41 143 /* Write the final tag */
c586dbf1 144 if (f->seal && f->writable)
b0af6f41 145 journal_file_append_tag(f);
feb12d3e 146#endif
b0af6f41 147
26687bf8 148 journal_file_set_offline(f);
cec736d2 149
fa6ac760
LP
150 if (f->mmap && f->fd >= 0)
151 mmap_cache_close_fd(f->mmap, f->fd);
cec736d2 152
11689d2a
LP
153 if (f->fd >= 0 && f->defrag_on_close) {
154
155 /* Be friendly to btrfs: turn COW back on again now,
156 * and defragment the file. We won't write to the file
157 * ever again, hence remove all fragmentation, and
158 * reenable all the good bits COW usually provides
159 * (such as data checksumming). */
160
1ed8f8c1 161 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
11689d2a
LP
162 (void) btrfs_defrag_fd(f->fd);
163 }
f27a3864 164
03e334a1 165 safe_close(f->fd);
cec736d2 166 free(f->path);
807e17f0 167
16e9f408
LP
168 if (f->mmap)
169 mmap_cache_unref(f->mmap);
170
4743015d 171 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 172
d89c8fdf 173#if defined(HAVE_XZ) || defined(HAVE_LZ4)
807e17f0
LP
174 free(f->compress_buffer);
175#endif
176
7560fffc 177#ifdef HAVE_GCRYPT
baed47c3
LP
178 if (f->fss_file)
179 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
dc4ebc07 180 else
b7c9ae91
LP
181 free(f->fsprg_state);
182
183 free(f->fsprg_seed);
7560fffc
LP
184
185 if (f->hmac)
186 gcry_md_close(f->hmac);
187#endif
188
cec736d2 189 free(f);
804ae586 190 return NULL;
cec736d2
LP
191}
192
0ac38b70 193static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 194 Header h = {};
cec736d2
LP
195 ssize_t k;
196 int r;
197
198 assert(f);
199
7560fffc 200 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 201 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 202
d89c8fdf
ZJS
203 h.incompatible_flags |= htole32(
204 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
205 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 206
d89c8fdf
ZJS
207 h.compatible_flags = htole32(
208 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 209
cec736d2
LP
210 r = sd_id128_randomize(&h.file_id);
211 if (r < 0)
212 return r;
213
0ac38b70
LP
214 if (template) {
215 h.seqnum_id = template->header->seqnum_id;
beec0085 216 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
217 } else
218 h.seqnum_id = h.file_id;
cec736d2
LP
219
220 k = pwrite(f->fd, &h, sizeof(h), 0);
221 if (k < 0)
222 return -errno;
223
224 if (k != sizeof(h))
225 return -EIO;
226
227 return 0;
228}
229
230static int journal_file_refresh_header(JournalFile *f) {
de190aef 231 sd_id128_t boot_id;
fa6ac760 232 int r;
cec736d2
LP
233
234 assert(f);
235
236 r = sd_id128_get_machine(&f->header->machine_id);
237 if (r < 0)
238 return r;
239
de190aef 240 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
241 if (r < 0)
242 return r;
243
de190aef
LP
244 if (sd_id128_equal(boot_id, f->header->boot_id))
245 f->tail_entry_monotonic_valid = true;
246
247 f->header->boot_id = boot_id;
248
fa6ac760 249 r = journal_file_set_online(f);
b788cc23 250
7560fffc 251 /* Sync the online state to disk */
a676e665 252 fsync(f->fd);
b788cc23 253
fa6ac760 254 return r;
cec736d2
LP
255}
256
257static int journal_file_verify_header(JournalFile *f) {
d89c8fdf
ZJS
258 uint32_t flags;
259
cec736d2
LP
260 assert(f);
261
7560fffc 262 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
263 return -EBADMSG;
264
7560fffc
LP
265 /* In both read and write mode we refuse to open files with
266 * incompatible flags we don't know */
d89c8fdf
ZJS
267 flags = le32toh(f->header->incompatible_flags);
268 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
269 if (flags & ~HEADER_INCOMPATIBLE_ANY)
270 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
271 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
272 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
273 if (flags)
274 log_debug("Journal file %s uses incompatible flags %"PRIx32
275 " disabled at compilation time.", f->path, flags);
cec736d2 276 return -EPROTONOSUPPORT;
d89c8fdf 277 }
cec736d2 278
7560fffc
LP
279 /* When open for writing we refuse to open files with
280 * compatible flags, too */
d89c8fdf
ZJS
281 flags = le32toh(f->header->compatible_flags);
282 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
283 if (flags & ~HEADER_COMPATIBLE_ANY)
284 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
285 f->path, flags & ~HEADER_COMPATIBLE_ANY);
286 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
287 if (flags)
288 log_debug("Journal file %s uses compatible flags %"PRIx32
289 " disabled at compilation time.", f->path, flags);
290 return -EPROTONOSUPPORT;
7560fffc
LP
291 }
292
db11ac1a
LP
293 if (f->header->state >= _STATE_MAX)
294 return -EBADMSG;
295
dca6219e
LP
296 /* The first addition was n_data, so check that we are at least this large */
297 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
298 return -EBADMSG;
299
8088cbd3 300 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
301 return -EBADMSG;
302
db11ac1a
LP
303 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
304 return -ENODATA;
305
306 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
307 return -ENODATA;
308
7762e02b
LP
309 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
310 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
311 !VALID64(le64toh(f->header->tail_object_offset)) ||
312 !VALID64(le64toh(f->header->entry_array_offset)))
313 return -ENODATA;
314
cec736d2 315 if (f->writable) {
ccdbaf91 316 uint8_t state;
cec736d2
LP
317 sd_id128_t machine_id;
318 int r;
319
320 r = sd_id128_get_machine(&machine_id);
321 if (r < 0)
322 return r;
323
324 if (!sd_id128_equal(machine_id, f->header->machine_id))
325 return -EHOSTDOWN;
326
de190aef 327 state = f->header->state;
cec736d2 328
71fa6f00
LP
329 if (state == STATE_ONLINE) {
330 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
331 return -EBUSY;
332 } else if (state == STATE_ARCHIVED)
cec736d2 333 return -ESHUTDOWN;
71fa6f00 334 else if (state != STATE_OFFLINE) {
8facc349 335 log_debug("Journal file %s has unknown state %i.", f->path, state);
71fa6f00
LP
336 return -EBUSY;
337 }
cec736d2
LP
338 }
339
d89c8fdf
ZJS
340 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
341 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 342
f1889c91 343 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 344
cec736d2
LP
345 return 0;
346}
347
2678031a
LP
348static int journal_file_fstat(JournalFile *f) {
349 assert(f);
350 assert(f->fd >= 0);
351
352 if (fstat(f->fd, &f->last_stat) < 0)
353 return -errno;
354
355 f->last_stat_usec = now(CLOCK_MONOTONIC);
356
357 /* Refuse appending to files that are already deleted */
358 if (f->last_stat.st_nlink <= 0)
359 return -EIDRM;
360
361 return 0;
362}
363
cec736d2 364static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 365 uint64_t old_size, new_size;
fec2aa2f 366 int r;
cec736d2
LP
367
368 assert(f);
369
cec736d2 370 /* We assume that this file is not sparse, and we know that
38ac38b2 371 * for sure, since we always call posix_fallocate()
cec736d2
LP
372 * ourselves */
373
fa6ac760
LP
374 if (mmap_cache_got_sigbus(f->mmap, f->fd))
375 return -EIO;
376
cec736d2 377 old_size =
23b0b2b2 378 le64toh(f->header->header_size) +
cec736d2
LP
379 le64toh(f->header->arena_size);
380
bc85bfee 381 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
382 if (new_size < le64toh(f->header->header_size))
383 new_size = le64toh(f->header->header_size);
bc85bfee 384
2678031a
LP
385 if (new_size <= old_size) {
386
387 /* We already pre-allocated enough space, but before
388 * we write to it, let's check with fstat() if the
389 * file got deleted, in order make sure we don't throw
390 * away the data immediately. Don't check fstat() for
391 * all writes though, but only once ever 10s. */
392
393 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
394 return 0;
395
396 return journal_file_fstat(f);
397 }
398
399 /* Allocate more space. */
cec736d2 400
a676e665 401 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 402 return -E2BIG;
cec736d2 403
a676e665 404 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
405 struct statvfs svfs;
406
407 if (fstatvfs(f->fd, &svfs) >= 0) {
408 uint64_t available;
409
070052ab 410 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
cec736d2
LP
411
412 if (new_size - old_size > available)
413 return -E2BIG;
414 }
415 }
416
eda4b58b
LP
417 /* Increase by larger blocks at once */
418 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
419 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
420 new_size = f->metrics.max_size;
421
bc85bfee
LP
422 /* Note that the glibc fallocate() fallback is very
423 inefficient, hence we try to minimize the allocation area
424 as we can. */
fec2aa2f
GV
425 r = posix_fallocate(f->fd, old_size, new_size - old_size);
426 if (r != 0)
427 return -r;
cec736d2 428
23b0b2b2 429 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 430
2678031a 431 return journal_file_fstat(f);
cec736d2
LP
432}
433
78519831 434static unsigned type_to_context(ObjectType type) {
d3d3208f 435 /* One context for each type, plus one catch-all for the rest */
69adae51 436 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 437 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 438 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
439}
440
7a9dabea 441static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
2678031a
LP
442 int r;
443
cec736d2 444 assert(f);
cec736d2
LP
445 assert(ret);
446
7762e02b
LP
447 if (size <= 0)
448 return -EINVAL;
449
2a59ea54 450 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
451 if (offset + size > (uint64_t) f->last_stat.st_size) {
452 /* Hmm, out of range? Let's refresh the fstat() data
453 * first, before we trust that check. */
454
2678031a
LP
455 r = journal_file_fstat(f);
456 if (r < 0)
457 return r;
458
459 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
460 return -EADDRNOTAVAIL;
461 }
462
7a9dabea 463 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
464}
465
16e9f408
LP
466static uint64_t minimum_header_size(Object *o) {
467
b8e891e6 468 static const uint64_t table[] = {
16e9f408
LP
469 [OBJECT_DATA] = sizeof(DataObject),
470 [OBJECT_FIELD] = sizeof(FieldObject),
471 [OBJECT_ENTRY] = sizeof(EntryObject),
472 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
473 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
474 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
475 [OBJECT_TAG] = sizeof(TagObject),
476 };
477
478 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
479 return sizeof(ObjectHeader);
480
481 return table[o->object.type];
482}
483
78519831 484int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
485 int r;
486 void *t;
487 Object *o;
488 uint64_t s;
489
490 assert(f);
491 assert(ret);
492
db11ac1a
LP
493 /* Objects may only be located at multiple of 64 bit */
494 if (!VALID64(offset))
495 return -EFAULT;
496
7a9dabea 497 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
498 if (r < 0)
499 return r;
500
501 o = (Object*) t;
502 s = le64toh(o->object.size);
503
504 if (s < sizeof(ObjectHeader))
505 return -EBADMSG;
506
16e9f408
LP
507 if (o->object.type <= OBJECT_UNUSED)
508 return -EBADMSG;
509
510 if (s < minimum_header_size(o))
511 return -EBADMSG;
512
d05089d8 513 if (type > OBJECT_UNUSED && o->object.type != type)
cec736d2
LP
514 return -EBADMSG;
515
516 if (s > sizeof(ObjectHeader)) {
7a9dabea 517 r = journal_file_move_to(f, type, false, offset, s, &t);
cec736d2
LP
518 if (r < 0)
519 return r;
520
521 o = (Object*) t;
522 }
523
cec736d2
LP
524 *ret = o;
525 return 0;
526}
527
d98cc1f2 528static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
529 uint64_t r;
530
531 assert(f);
532
beec0085 533 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
534
535 if (seqnum) {
de190aef 536 /* If an external seqnum counter was passed, we update
c2373f84
LP
537 * both the local and the external one, and set it to
538 * the maximum of both */
539
540 if (*seqnum + 1 > r)
541 r = *seqnum + 1;
542
543 *seqnum = r;
544 }
545
beec0085 546 f->header->tail_entry_seqnum = htole64(r);
cec736d2 547
beec0085
LP
548 if (f->header->head_entry_seqnum == 0)
549 f->header->head_entry_seqnum = htole64(r);
de190aef 550
cec736d2
LP
551 return r;
552}
553
78519831 554int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
555 int r;
556 uint64_t p;
557 Object *tail, *o;
558 void *t;
559
560 assert(f);
d05089d8 561 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
562 assert(size >= sizeof(ObjectHeader));
563 assert(offset);
564 assert(ret);
565
26687bf8
OS
566 r = journal_file_set_online(f);
567 if (r < 0)
568 return r;
569
cec736d2 570 p = le64toh(f->header->tail_object_offset);
cec736d2 571 if (p == 0)
23b0b2b2 572 p = le64toh(f->header->header_size);
cec736d2 573 else {
d05089d8 574 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
575 if (r < 0)
576 return r;
577
578 p += ALIGN64(le64toh(tail->object.size));
579 }
580
581 r = journal_file_allocate(f, p, size);
582 if (r < 0)
583 return r;
584
fcde2389 585 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
586 if (r < 0)
587 return r;
588
589 o = (Object*) t;
590
591 zero(o->object);
de190aef 592 o->object.type = type;
cec736d2
LP
593 o->object.size = htole64(size);
594
595 f->header->tail_object_offset = htole64(p);
cec736d2
LP
596 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
597
598 *ret = o;
599 *offset = p;
600
601 return 0;
602}
603
de190aef 604static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
605 uint64_t s, p;
606 Object *o;
607 int r;
608
609 assert(f);
610
070052ab
LP
611 /* We estimate that we need 1 hash table entry per 768 bytes
612 of journal file and we want to make sure we never get
613 beyond 75% fill level. Calculate the hash table size for
614 the maximum file size based on these metrics. */
4a92baf3 615
dfabe643 616 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
617 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
618 s = DEFAULT_DATA_HASH_TABLE_SIZE;
619
507f22bd 620 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 621
de190aef
LP
622 r = journal_file_append_object(f,
623 OBJECT_DATA_HASH_TABLE,
624 offsetof(Object, hash_table.items) + s,
625 &o, &p);
cec736d2
LP
626 if (r < 0)
627 return r;
628
29804cc1 629 memzero(o->hash_table.items, s);
cec736d2 630
de190aef
LP
631 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
632 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
633
634 return 0;
635}
636
de190aef 637static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
638 uint64_t s, p;
639 Object *o;
640 int r;
641
642 assert(f);
643
3c1668da
LP
644 /* We use a fixed size hash table for the fields as this
645 * number should grow very slowly only */
646
de190aef
LP
647 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
648 r = journal_file_append_object(f,
649 OBJECT_FIELD_HASH_TABLE,
650 offsetof(Object, hash_table.items) + s,
651 &o, &p);
cec736d2
LP
652 if (r < 0)
653 return r;
654
29804cc1 655 memzero(o->hash_table.items, s);
cec736d2 656
de190aef
LP
657 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
658 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
659
660 return 0;
661}
662
dade37d4 663int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
664 uint64_t s, p;
665 void *t;
666 int r;
667
668 assert(f);
669
dade37d4
LP
670 if (f->data_hash_table)
671 return 0;
672
de190aef
LP
673 p = le64toh(f->header->data_hash_table_offset);
674 s = le64toh(f->header->data_hash_table_size);
cec736d2 675
de190aef 676 r = journal_file_move_to(f,
16e9f408 677 OBJECT_DATA_HASH_TABLE,
fcde2389 678 true,
de190aef
LP
679 p, s,
680 &t);
cec736d2
LP
681 if (r < 0)
682 return r;
683
de190aef 684 f->data_hash_table = t;
cec736d2
LP
685 return 0;
686}
687
dade37d4 688int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
689 uint64_t s, p;
690 void *t;
691 int r;
692
693 assert(f);
694
dade37d4
LP
695 if (f->field_hash_table)
696 return 0;
697
de190aef
LP
698 p = le64toh(f->header->field_hash_table_offset);
699 s = le64toh(f->header->field_hash_table_size);
cec736d2 700
de190aef 701 r = journal_file_move_to(f,
16e9f408 702 OBJECT_FIELD_HASH_TABLE,
fcde2389 703 true,
de190aef
LP
704 p, s,
705 &t);
cec736d2
LP
706 if (r < 0)
707 return r;
708
de190aef 709 f->field_hash_table = t;
cec736d2
LP
710 return 0;
711}
712
3c1668da
LP
713static int journal_file_link_field(
714 JournalFile *f,
715 Object *o,
716 uint64_t offset,
717 uint64_t hash) {
718
805d1486 719 uint64_t p, h, m;
3c1668da
LP
720 int r;
721
722 assert(f);
723 assert(o);
724 assert(offset > 0);
725
726 if (o->object.type != OBJECT_FIELD)
727 return -EINVAL;
728
805d1486
LP
729 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
730 if (m <= 0)
731 return -EBADMSG;
3c1668da 732
805d1486 733 /* This might alter the window we are looking at */
3c1668da
LP
734 o->field.next_hash_offset = o->field.head_data_offset = 0;
735
805d1486 736 h = hash % m;
3c1668da
LP
737 p = le64toh(f->field_hash_table[h].tail_hash_offset);
738 if (p == 0)
739 f->field_hash_table[h].head_hash_offset = htole64(offset);
740 else {
741 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
742 if (r < 0)
743 return r;
744
745 o->field.next_hash_offset = htole64(offset);
746 }
747
748 f->field_hash_table[h].tail_hash_offset = htole64(offset);
749
750 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
751 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
752
753 return 0;
754}
755
756static int journal_file_link_data(
757 JournalFile *f,
758 Object *o,
759 uint64_t offset,
760 uint64_t hash) {
761
805d1486 762 uint64_t p, h, m;
cec736d2
LP
763 int r;
764
765 assert(f);
766 assert(o);
767 assert(offset > 0);
b588975f
LP
768
769 if (o->object.type != OBJECT_DATA)
770 return -EINVAL;
cec736d2 771
805d1486
LP
772 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
773 if (m <= 0)
774 return -EBADMSG;
48496df6 775
805d1486 776 /* This might alter the window we are looking at */
de190aef
LP
777 o->data.next_hash_offset = o->data.next_field_offset = 0;
778 o->data.entry_offset = o->data.entry_array_offset = 0;
779 o->data.n_entries = 0;
cec736d2 780
805d1486 781 h = hash % m;
8db4213e 782 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 783 if (p == 0)
cec736d2 784 /* Only entry in the hash table is easy */
de190aef 785 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 786 else {
48496df6
LP
787 /* Move back to the previous data object, to patch in
788 * pointer */
cec736d2 789
de190aef 790 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
791 if (r < 0)
792 return r;
793
de190aef 794 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
795 }
796
de190aef 797 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 798
dca6219e
LP
799 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
800 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
801
cec736d2
LP
802 return 0;
803}
804
3c1668da
LP
805int journal_file_find_field_object_with_hash(
806 JournalFile *f,
807 const void *field, uint64_t size, uint64_t hash,
808 Object **ret, uint64_t *offset) {
809
805d1486 810 uint64_t p, osize, h, m;
3c1668da
LP
811 int r;
812
813 assert(f);
814 assert(field && size > 0);
815
dade37d4
LP
816 /* If the field hash table is empty, we can't find anything */
817 if (le64toh(f->header->field_hash_table_size) <= 0)
818 return 0;
819
820 /* Map the field hash table, if it isn't mapped yet. */
821 r = journal_file_map_field_hash_table(f);
822 if (r < 0)
823 return r;
824
3c1668da
LP
825 osize = offsetof(Object, field.payload) + size;
826
805d1486 827 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
805d1486 828 if (m <= 0)
3c1668da
LP
829 return -EBADMSG;
830
805d1486 831 h = hash % m;
3c1668da
LP
832 p = le64toh(f->field_hash_table[h].head_hash_offset);
833
834 while (p > 0) {
835 Object *o;
836
837 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
838 if (r < 0)
839 return r;
840
841 if (le64toh(o->field.hash) == hash &&
842 le64toh(o->object.size) == osize &&
843 memcmp(o->field.payload, field, size) == 0) {
844
845 if (ret)
846 *ret = o;
847 if (offset)
848 *offset = p;
849
850 return 1;
851 }
852
853 p = le64toh(o->field.next_hash_offset);
854 }
855
856 return 0;
857}
858
859int journal_file_find_field_object(
860 JournalFile *f,
861 const void *field, uint64_t size,
862 Object **ret, uint64_t *offset) {
863
864 uint64_t hash;
865
866 assert(f);
867 assert(field && size > 0);
868
869 hash = hash64(field, size);
870
871 return journal_file_find_field_object_with_hash(f,
872 field, size, hash,
873 ret, offset);
874}
875
de190aef
LP
876int journal_file_find_data_object_with_hash(
877 JournalFile *f,
878 const void *data, uint64_t size, uint64_t hash,
879 Object **ret, uint64_t *offset) {
48496df6 880
805d1486 881 uint64_t p, osize, h, m;
cec736d2
LP
882 int r;
883
884 assert(f);
885 assert(data || size == 0);
886
dade37d4
LP
887 /* If there's no data hash table, then there's no entry. */
888 if (le64toh(f->header->data_hash_table_size) <= 0)
889 return 0;
890
891 /* Map the data hash table, if it isn't mapped yet. */
892 r = journal_file_map_data_hash_table(f);
893 if (r < 0)
894 return r;
895
cec736d2
LP
896 osize = offsetof(Object, data.payload) + size;
897
805d1486
LP
898 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
899 if (m <= 0)
bc85bfee
LP
900 return -EBADMSG;
901
805d1486 902 h = hash % m;
de190aef 903 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 904
de190aef
LP
905 while (p > 0) {
906 Object *o;
cec736d2 907
de190aef 908 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
909 if (r < 0)
910 return r;
911
807e17f0 912 if (le64toh(o->data.hash) != hash)
85a131e8 913 goto next;
807e17f0 914
d89c8fdf 915 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 916#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51 917 uint64_t l;
a7f7d1bd 918 size_t rsize = 0;
cec736d2 919
807e17f0
LP
920 l = le64toh(o->object.size);
921 if (l <= offsetof(Object, data.payload))
cec736d2
LP
922 return -EBADMSG;
923
807e17f0
LP
924 l -= offsetof(Object, data.payload);
925
d89c8fdf
ZJS
926 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
927 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
928 if (r < 0)
929 return r;
807e17f0 930
b785c858 931 if (rsize == size &&
807e17f0
LP
932 memcmp(f->compress_buffer, data, size) == 0) {
933
934 if (ret)
935 *ret = o;
936
937 if (offset)
938 *offset = p;
939
940 return 1;
941 }
3b1a55e1
ZJS
942#else
943 return -EPROTONOSUPPORT;
944#endif
807e17f0
LP
945 } else if (le64toh(o->object.size) == osize &&
946 memcmp(o->data.payload, data, size) == 0) {
947
cec736d2
LP
948 if (ret)
949 *ret = o;
950
951 if (offset)
952 *offset = p;
953
de190aef 954 return 1;
cec736d2
LP
955 }
956
85a131e8 957 next:
cec736d2
LP
958 p = le64toh(o->data.next_hash_offset);
959 }
960
de190aef
LP
961 return 0;
962}
963
964int journal_file_find_data_object(
965 JournalFile *f,
966 const void *data, uint64_t size,
967 Object **ret, uint64_t *offset) {
968
969 uint64_t hash;
970
971 assert(f);
972 assert(data || size == 0);
973
974 hash = hash64(data, size);
975
976 return journal_file_find_data_object_with_hash(f,
977 data, size, hash,
978 ret, offset);
979}
980
3c1668da
LP
981static int journal_file_append_field(
982 JournalFile *f,
983 const void *field, uint64_t size,
984 Object **ret, uint64_t *offset) {
985
986 uint64_t hash, p;
987 uint64_t osize;
988 Object *o;
989 int r;
990
991 assert(f);
992 assert(field && size > 0);
993
994 hash = hash64(field, size);
995
996 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
997 if (r < 0)
998 return r;
999 else if (r > 0) {
1000
1001 if (ret)
1002 *ret = o;
1003
1004 if (offset)
1005 *offset = p;
1006
1007 return 0;
1008 }
1009
1010 osize = offsetof(Object, field.payload) + size;
1011 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
1012 if (r < 0)
1013 return r;
3c1668da
LP
1014
1015 o->field.hash = htole64(hash);
1016 memcpy(o->field.payload, field, size);
1017
1018 r = journal_file_link_field(f, o, p, hash);
1019 if (r < 0)
1020 return r;
1021
1022 /* The linking might have altered the window, so let's
1023 * refresh our pointer */
1024 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1025 if (r < 0)
1026 return r;
1027
1028#ifdef HAVE_GCRYPT
1029 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1030 if (r < 0)
1031 return r;
1032#endif
1033
1034 if (ret)
1035 *ret = o;
1036
1037 if (offset)
1038 *offset = p;
1039
1040 return 0;
1041}
1042
48496df6
LP
1043static int journal_file_append_data(
1044 JournalFile *f,
1045 const void *data, uint64_t size,
1046 Object **ret, uint64_t *offset) {
1047
de190aef
LP
1048 uint64_t hash, p;
1049 uint64_t osize;
1050 Object *o;
d89c8fdf 1051 int r, compression = 0;
3c1668da 1052 const void *eq;
de190aef
LP
1053
1054 assert(f);
1055 assert(data || size == 0);
1056
1057 hash = hash64(data, size);
1058
1059 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1060 if (r < 0)
1061 return r;
1062 else if (r > 0) {
1063
1064 if (ret)
1065 *ret = o;
1066
1067 if (offset)
1068 *offset = p;
1069
1070 return 0;
1071 }
1072
1073 osize = offsetof(Object, data.payload) + size;
1074 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1075 if (r < 0)
1076 return r;
1077
cec736d2 1078 o->data.hash = htole64(hash);
807e17f0 1079
d89c8fdf
ZJS
1080#if defined(HAVE_XZ) || defined(HAVE_LZ4)
1081 if (f->compress_xz &&
807e17f0 1082 size >= COMPRESSION_SIZE_THRESHOLD) {
a7f7d1bd 1083 size_t rsize = 0;
807e17f0 1084
d89c8fdf 1085 compression = compress_blob(data, size, o->data.payload, &rsize);
807e17f0 1086
d89c8fdf 1087 if (compression) {
807e17f0 1088 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1089 o->object.flags |= compression;
807e17f0 1090
fa1c4b51 1091 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1092 size, rsize, object_compressed_to_string(compression));
807e17f0
LP
1093 }
1094 }
1095#endif
1096
d89c8fdf 1097 if (!compression && size > 0)
807e17f0 1098 memcpy(o->data.payload, data, size);
cec736d2 1099
de190aef 1100 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1101 if (r < 0)
1102 return r;
1103
48496df6
LP
1104 /* The linking might have altered the window, so let's
1105 * refresh our pointer */
1106 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1107 if (r < 0)
1108 return r;
1109
08c6f819
SL
1110 if (!data)
1111 eq = NULL;
1112 else
1113 eq = memchr(data, '=', size);
3c1668da 1114 if (eq && eq > data) {
748db592 1115 Object *fo = NULL;
3c1668da 1116 uint64_t fp;
3c1668da
LP
1117
1118 /* Create field object ... */
1119 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1120 if (r < 0)
1121 return r;
1122
1123 /* ... and link it in. */
1124 o->data.next_field_offset = fo->field.head_data_offset;
1125 fo->field.head_data_offset = le64toh(p);
1126 }
1127
5996c7c2
LP
1128#ifdef HAVE_GCRYPT
1129 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1130 if (r < 0)
1131 return r;
1132#endif
1133
cec736d2
LP
1134 if (ret)
1135 *ret = o;
1136
1137 if (offset)
de190aef 1138 *offset = p;
cec736d2
LP
1139
1140 return 0;
1141}
1142
1143uint64_t journal_file_entry_n_items(Object *o) {
1144 assert(o);
b588975f
LP
1145
1146 if (o->object.type != OBJECT_ENTRY)
1147 return 0;
cec736d2
LP
1148
1149 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1150}
1151
0284adc6 1152uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1153 assert(o);
b588975f
LP
1154
1155 if (o->object.type != OBJECT_ENTRY_ARRAY)
1156 return 0;
de190aef
LP
1157
1158 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1159}
1160
fb9a24b6
LP
1161uint64_t journal_file_hash_table_n_items(Object *o) {
1162 assert(o);
b588975f
LP
1163
1164 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1165 o->object.type != OBJECT_FIELD_HASH_TABLE)
1166 return 0;
fb9a24b6
LP
1167
1168 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1169}
1170
de190aef 1171static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1172 le64_t *first,
1173 le64_t *idx,
de190aef 1174 uint64_t p) {
cec736d2 1175 int r;
de190aef
LP
1176 uint64_t n = 0, ap = 0, q, i, a, hidx;
1177 Object *o;
1178
cec736d2 1179 assert(f);
de190aef
LP
1180 assert(first);
1181 assert(idx);
1182 assert(p > 0);
cec736d2 1183
de190aef
LP
1184 a = le64toh(*first);
1185 i = hidx = le64toh(*idx);
1186 while (a > 0) {
1187
1188 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1189 if (r < 0)
1190 return r;
cec736d2 1191
de190aef
LP
1192 n = journal_file_entry_array_n_items(o);
1193 if (i < n) {
1194 o->entry_array.items[i] = htole64(p);
1195 *idx = htole64(hidx + 1);
1196 return 0;
1197 }
cec736d2 1198
de190aef
LP
1199 i -= n;
1200 ap = a;
1201 a = le64toh(o->entry_array.next_entry_array_offset);
1202 }
1203
1204 if (hidx > n)
1205 n = (hidx+1) * 2;
1206 else
1207 n = n * 2;
1208
1209 if (n < 4)
1210 n = 4;
1211
1212 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1213 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1214 &o, &q);
cec736d2
LP
1215 if (r < 0)
1216 return r;
1217
feb12d3e 1218#ifdef HAVE_GCRYPT
5996c7c2 1219 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1220 if (r < 0)
1221 return r;
feb12d3e 1222#endif
b0af6f41 1223
de190aef 1224 o->entry_array.items[i] = htole64(p);
cec736d2 1225
de190aef 1226 if (ap == 0)
7be3aa17 1227 *first = htole64(q);
cec736d2 1228 else {
de190aef 1229 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1230 if (r < 0)
1231 return r;
1232
de190aef
LP
1233 o->entry_array.next_entry_array_offset = htole64(q);
1234 }
cec736d2 1235
2dee23eb
LP
1236 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1237 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1238
de190aef
LP
1239 *idx = htole64(hidx + 1);
1240
1241 return 0;
1242}
cec736d2 1243
de190aef 1244static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1245 le64_t *extra,
1246 le64_t *first,
1247 le64_t *idx,
de190aef
LP
1248 uint64_t p) {
1249
1250 int r;
1251
1252 assert(f);
1253 assert(extra);
1254 assert(first);
1255 assert(idx);
1256 assert(p > 0);
1257
1258 if (*idx == 0)
1259 *extra = htole64(p);
1260 else {
4fd052ae 1261 le64_t i;
de190aef 1262
7be3aa17 1263 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1264 r = link_entry_into_array(f, first, &i, p);
1265 if (r < 0)
1266 return r;
cec736d2
LP
1267 }
1268
de190aef
LP
1269 *idx = htole64(le64toh(*idx) + 1);
1270 return 0;
1271}
1272
1273static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1274 uint64_t p;
1275 int r;
1276 assert(f);
1277 assert(o);
1278 assert(offset > 0);
1279
1280 p = le64toh(o->entry.items[i].object_offset);
1281 if (p == 0)
1282 return -EINVAL;
1283
1284 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1285 if (r < 0)
1286 return r;
1287
de190aef
LP
1288 return link_entry_into_array_plus_one(f,
1289 &o->data.entry_offset,
1290 &o->data.entry_array_offset,
1291 &o->data.n_entries,
1292 offset);
cec736d2
LP
1293}
1294
1295static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1296 uint64_t n, i;
cec736d2
LP
1297 int r;
1298
1299 assert(f);
1300 assert(o);
1301 assert(offset > 0);
b588975f
LP
1302
1303 if (o->object.type != OBJECT_ENTRY)
1304 return -EINVAL;
cec736d2 1305
b788cc23
LP
1306 __sync_synchronize();
1307
cec736d2 1308 /* Link up the entry itself */
de190aef
LP
1309 r = link_entry_into_array(f,
1310 &f->header->entry_array_offset,
1311 &f->header->n_entries,
1312 offset);
1313 if (r < 0)
1314 return r;
cec736d2 1315
507f22bd 1316 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1317
de190aef 1318 if (f->header->head_entry_realtime == 0)
0ac38b70 1319 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1320
0ac38b70 1321 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1322 f->header->tail_entry_monotonic = o->entry.monotonic;
1323
1324 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1325
1326 /* Link up the items */
1327 n = journal_file_entry_n_items(o);
1328 for (i = 0; i < n; i++) {
1329 r = journal_file_link_entry_item(f, o, offset, i);
1330 if (r < 0)
1331 return r;
1332 }
1333
cec736d2
LP
1334 return 0;
1335}
1336
1337static int journal_file_append_entry_internal(
1338 JournalFile *f,
1339 const dual_timestamp *ts,
1340 uint64_t xor_hash,
1341 const EntryItem items[], unsigned n_items,
de190aef 1342 uint64_t *seqnum,
cec736d2
LP
1343 Object **ret, uint64_t *offset) {
1344 uint64_t np;
1345 uint64_t osize;
1346 Object *o;
1347 int r;
1348
1349 assert(f);
1350 assert(items || n_items == 0);
de190aef 1351 assert(ts);
cec736d2
LP
1352
1353 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1354
de190aef 1355 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1356 if (r < 0)
1357 return r;
1358
d98cc1f2 1359 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1360 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1361 o->entry.realtime = htole64(ts->realtime);
1362 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1363 o->entry.xor_hash = htole64(xor_hash);
1364 o->entry.boot_id = f->header->boot_id;
1365
feb12d3e 1366#ifdef HAVE_GCRYPT
5996c7c2 1367 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1368 if (r < 0)
1369 return r;
feb12d3e 1370#endif
b0af6f41 1371
cec736d2
LP
1372 r = journal_file_link_entry(f, o, np);
1373 if (r < 0)
1374 return r;
1375
1376 if (ret)
1377 *ret = o;
1378
1379 if (offset)
1380 *offset = np;
1381
1382 return 0;
1383}
1384
cf244689 1385void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1386 assert(f);
1387
1388 /* inotify() does not receive IN_MODIFY events from file
1389 * accesses done via mmap(). After each access we hence
1390 * trigger IN_MODIFY by truncating the journal file to its
1391 * current size which triggers IN_MODIFY. */
1392
bc85bfee
LP
1393 __sync_synchronize();
1394
50f20cfd 1395 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
56f64d95 1396 log_error_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1397}
1398
1f2da9ec
LP
1399static int entry_item_cmp(const void *_a, const void *_b) {
1400 const EntryItem *a = _a, *b = _b;
1401
1402 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1403 return -1;
1404 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1405 return 1;
1406 return 0;
1407}
1408
de190aef 1409int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1410 unsigned i;
1411 EntryItem *items;
1412 int r;
1413 uint64_t xor_hash = 0;
de190aef 1414 struct dual_timestamp _ts;
cec736d2
LP
1415
1416 assert(f);
1417 assert(iovec || n_iovec == 0);
1418
de190aef
LP
1419 if (!ts) {
1420 dual_timestamp_get(&_ts);
1421 ts = &_ts;
1422 }
1423
1424 if (f->tail_entry_monotonic_valid &&
1425 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1426 return -EINVAL;
1427
feb12d3e 1428#ifdef HAVE_GCRYPT
7560fffc
LP
1429 r = journal_file_maybe_append_tag(f, ts->realtime);
1430 if (r < 0)
1431 return r;
feb12d3e 1432#endif
7560fffc 1433
64825d3c 1434 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1435 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1436
1437 for (i = 0; i < n_iovec; i++) {
1438 uint64_t p;
1439 Object *o;
1440
1441 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1442 if (r < 0)
cf244689 1443 return r;
cec736d2
LP
1444
1445 xor_hash ^= le64toh(o->data.hash);
1446 items[i].object_offset = htole64(p);
de7b95cd 1447 items[i].hash = o->data.hash;
cec736d2
LP
1448 }
1449
1f2da9ec
LP
1450 /* Order by the position on disk, in order to improve seek
1451 * times for rotating media. */
7ff7394d 1452 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1453
de190aef 1454 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1455
fa6ac760
LP
1456 /* If the memory mapping triggered a SIGBUS then we return an
1457 * IO error and ignore the error code passed down to us, since
1458 * it is very likely just an effect of a nullified replacement
1459 * mapping page */
1460
1461 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1462 r = -EIO;
1463
50f20cfd
LP
1464 journal_file_post_change(f);
1465
cec736d2
LP
1466 return r;
1467}
1468
/* One cached position within a chain of entry array objects, keyed by the
 * offset of the first array of the chain. */
typedef struct ChainCacheItem {
        /* the array at the beginning of the chain */
        uint64_t first;

        /* the cached array */
        uint64_t array;

        /* the first item in the cached array */
        uint64_t begin;

        /* the total number of items in all arrays before this one in the chain */
        uint64_t total;

        /* the last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1476
1477static void chain_cache_put(
4743015d 1478 OrderedHashmap *h,
a4bcff5b
LP
1479 ChainCacheItem *ci,
1480 uint64_t first,
1481 uint64_t array,
1482 uint64_t begin,
f268980d
LP
1483 uint64_t total,
1484 uint64_t last_index) {
a4bcff5b
LP
1485
1486 if (!ci) {
34741aa3
LP
1487 /* If the chain item to cache for this chain is the
1488 * first one it's not worth caching anything */
1489 if (array == first)
1490 return;
1491
29433089 1492 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 1493 ci = ordered_hashmap_steal_first(h);
29433089
LP
1494 assert(ci);
1495 } else {
a4bcff5b
LP
1496 ci = new(ChainCacheItem, 1);
1497 if (!ci)
1498 return;
1499 }
1500
1501 ci->first = first;
1502
4743015d 1503 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
1504 free(ci);
1505 return;
1506 }
1507 } else
1508 assert(ci->first == first);
1509
1510 ci->array = array;
1511 ci->begin = begin;
1512 ci->total = total;
f268980d 1513 ci->last_index = last_index;
a4bcff5b
LP
1514}
1515
f268980d
LP
1516static int generic_array_get(
1517 JournalFile *f,
1518 uint64_t first,
1519 uint64_t i,
1520 Object **ret, uint64_t *offset) {
de190aef 1521
cec736d2 1522 Object *o;
a4bcff5b 1523 uint64_t p = 0, a, t = 0;
cec736d2 1524 int r;
a4bcff5b 1525 ChainCacheItem *ci;
cec736d2
LP
1526
1527 assert(f);
1528
de190aef 1529 a = first;
a4bcff5b
LP
1530
1531 /* Try the chain cache first */
4743015d 1532 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1533 if (ci && i > ci->total) {
1534 a = ci->array;
1535 i -= ci->total;
1536 t = ci->total;
1537 }
1538
de190aef 1539 while (a > 0) {
a4bcff5b 1540 uint64_t k;
cec736d2 1541
de190aef
LP
1542 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1543 if (r < 0)
1544 return r;
cec736d2 1545
a4bcff5b
LP
1546 k = journal_file_entry_array_n_items(o);
1547 if (i < k) {
de190aef 1548 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1549 goto found;
cec736d2
LP
1550 }
1551
a4bcff5b
LP
1552 i -= k;
1553 t += k;
de190aef
LP
1554 a = le64toh(o->entry_array.next_entry_array_offset);
1555 }
1556
a4bcff5b
LP
1557 return 0;
1558
1559found:
1560 /* Let's cache this item for the next invocation */
af13a6b0 1561 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1562
1563 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1564 if (r < 0)
1565 return r;
1566
1567 if (ret)
1568 *ret = o;
1569
1570 if (offset)
1571 *offset = p;
1572
1573 return 1;
1574}
1575
f268980d
LP
1576static int generic_array_get_plus_one(
1577 JournalFile *f,
1578 uint64_t extra,
1579 uint64_t first,
1580 uint64_t i,
1581 Object **ret, uint64_t *offset) {
de190aef
LP
1582
1583 Object *o;
1584
1585 assert(f);
1586
1587 if (i == 0) {
1588 int r;
1589
1590 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1591 if (r < 0)
1592 return r;
1593
de190aef
LP
1594 if (ret)
1595 *ret = o;
cec736d2 1596
de190aef
LP
1597 if (offset)
1598 *offset = extra;
cec736d2 1599
de190aef 1600 return 1;
cec736d2
LP
1601 }
1602
de190aef
LP
1603 return generic_array_get(f, first, i-1, ret, offset);
1604}
cec736d2 1605
de190aef
LP
/* Results of the test_object callbacks used when bisecting entry arrays */
enum {
        TEST_FOUND = 0,  /* the tested value equals the needle */
        TEST_LEFT = 1,   /* the tested value is smaller than the needle */
        TEST_RIGHT = 2   /* the tested value is larger than the needle */
};
cec736d2 1611
f268980d
LP
1612static int generic_array_bisect(
1613 JournalFile *f,
1614 uint64_t first,
1615 uint64_t n,
1616 uint64_t needle,
1617 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1618 direction_t direction,
1619 Object **ret,
1620 uint64_t *offset,
1621 uint64_t *idx) {
1622
1623 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1624 bool subtract_one = false;
1625 Object *o, *array = NULL;
1626 int r;
a4bcff5b 1627 ChainCacheItem *ci;
cec736d2 1628
de190aef
LP
1629 assert(f);
1630 assert(test_object);
cec736d2 1631
a4bcff5b 1632 /* Start with the first array in the chain */
de190aef 1633 a = first;
a4bcff5b 1634
4743015d 1635 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1636 if (ci && n > ci->total) {
1637 /* Ah, we have iterated this bisection array chain
1638 * previously! Let's see if we can skip ahead in the
1639 * chain, as far as the last time. But we can't jump
1640 * backwards in the chain, so let's check that
1641 * first. */
1642
1643 r = test_object(f, ci->begin, needle);
1644 if (r < 0)
1645 return r;
1646
1647 if (r == TEST_LEFT) {
f268980d 1648 /* OK, what we are looking for is right of the
a4bcff5b
LP
1649 * begin of this EntryArray, so let's jump
1650 * straight to previously cached array in the
1651 * chain */
1652
1653 a = ci->array;
1654 n -= ci->total;
1655 t = ci->total;
f268980d 1656 last_index = ci->last_index;
a4bcff5b
LP
1657 }
1658 }
1659
de190aef
LP
1660 while (a > 0) {
1661 uint64_t left, right, k, lp;
1662
1663 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1664 if (r < 0)
1665 return r;
1666
de190aef
LP
1667 k = journal_file_entry_array_n_items(array);
1668 right = MIN(k, n);
1669 if (right <= 0)
1670 return 0;
cec736d2 1671
de190aef
LP
1672 i = right - 1;
1673 lp = p = le64toh(array->entry_array.items[i]);
1674 if (p <= 0)
1675 return -EBADMSG;
cec736d2 1676
de190aef
LP
1677 r = test_object(f, p, needle);
1678 if (r < 0)
1679 return r;
cec736d2 1680
de190aef
LP
1681 if (r == TEST_FOUND)
1682 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1683
1684 if (r == TEST_RIGHT) {
1685 left = 0;
1686 right -= 1;
f268980d
LP
1687
1688 if (last_index != (uint64_t) -1) {
1689 assert(last_index <= right);
1690
1691 /* If we cached the last index we
1692 * looked at, let's try to not to jump
1693 * too wildly around and see if we can
1694 * limit the range to look at early to
1695 * the immediate neighbors of the last
1696 * index we looked at. */
1697
1698 if (last_index > 0) {
1699 uint64_t x = last_index - 1;
1700
1701 p = le64toh(array->entry_array.items[x]);
1702 if (p <= 0)
1703 return -EBADMSG;
1704
1705 r = test_object(f, p, needle);
1706 if (r < 0)
1707 return r;
1708
1709 if (r == TEST_FOUND)
1710 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1711
1712 if (r == TEST_RIGHT)
1713 right = x;
1714 else
1715 left = x + 1;
1716 }
1717
1718 if (last_index < right) {
1719 uint64_t y = last_index + 1;
1720
1721 p = le64toh(array->entry_array.items[y]);
1722 if (p <= 0)
1723 return -EBADMSG;
1724
1725 r = test_object(f, p, needle);
1726 if (r < 0)
1727 return r;
1728
1729 if (r == TEST_FOUND)
1730 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1731
1732 if (r == TEST_RIGHT)
1733 right = y;
1734 else
1735 left = y + 1;
1736 }
f268980d
LP
1737 }
1738
de190aef
LP
1739 for (;;) {
1740 if (left == right) {
1741 if (direction == DIRECTION_UP)
1742 subtract_one = true;
1743
1744 i = left;
1745 goto found;
1746 }
1747
1748 assert(left < right);
de190aef 1749 i = (left + right) / 2;
f268980d 1750
de190aef
LP
1751 p = le64toh(array->entry_array.items[i]);
1752 if (p <= 0)
1753 return -EBADMSG;
1754
1755 r = test_object(f, p, needle);
1756 if (r < 0)
1757 return r;
cec736d2 1758
de190aef
LP
1759 if (r == TEST_FOUND)
1760 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1761
1762 if (r == TEST_RIGHT)
1763 right = i;
1764 else
1765 left = i + 1;
1766 }
1767 }
1768
2173cbf8 1769 if (k >= n) {
cbdca852
LP
1770 if (direction == DIRECTION_UP) {
1771 i = n;
1772 subtract_one = true;
1773 goto found;
1774 }
1775
cec736d2 1776 return 0;
cbdca852 1777 }
cec736d2 1778
de190aef
LP
1779 last_p = lp;
1780
1781 n -= k;
1782 t += k;
f268980d 1783 last_index = (uint64_t) -1;
de190aef 1784 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1785 }
1786
1787 return 0;
de190aef
LP
1788
1789found:
1790 if (subtract_one && t == 0 && i == 0)
1791 return 0;
1792
a4bcff5b 1793 /* Let's cache this item for the next invocation */
af13a6b0 1794 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 1795
de190aef
LP
1796 if (subtract_one && i == 0)
1797 p = last_p;
1798 else if (subtract_one)
1799 p = le64toh(array->entry_array.items[i-1]);
1800 else
1801 p = le64toh(array->entry_array.items[i]);
1802
1803 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1804 if (r < 0)
1805 return r;
1806
1807 if (ret)
1808 *ret = o;
1809
1810 if (offset)
1811 *offset = p;
1812
1813 if (idx)
cbdca852 1814 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1815
1816 return 1;
cec736d2
LP
1817}
1818
f268980d
LP
1819static int generic_array_bisect_plus_one(
1820 JournalFile *f,
1821 uint64_t extra,
1822 uint64_t first,
1823 uint64_t n,
1824 uint64_t needle,
1825 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1826 direction_t direction,
1827 Object **ret,
1828 uint64_t *offset,
1829 uint64_t *idx) {
de190aef 1830
cec736d2 1831 int r;
cbdca852
LP
1832 bool step_back = false;
1833 Object *o;
cec736d2
LP
1834
1835 assert(f);
de190aef 1836 assert(test_object);
cec736d2 1837
de190aef
LP
1838 if (n <= 0)
1839 return 0;
cec736d2 1840
de190aef
LP
1841 /* This bisects the array in object 'first', but first checks
1842 * an extra */
de190aef
LP
1843 r = test_object(f, extra, needle);
1844 if (r < 0)
1845 return r;
a536e261
LP
1846
1847 if (r == TEST_FOUND)
1848 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1849
cbdca852
LP
1850 /* if we are looking with DIRECTION_UP then we need to first
1851 see if in the actual array there is a matching entry, and
1852 return the last one of that. But if there isn't any we need
1853 to return this one. Hence remember this, and return it
1854 below. */
1855 if (r == TEST_LEFT)
1856 step_back = direction == DIRECTION_UP;
de190aef 1857
cbdca852
LP
1858 if (r == TEST_RIGHT) {
1859 if (direction == DIRECTION_DOWN)
1860 goto found;
1861 else
1862 return 0;
a536e261 1863 }
cec736d2 1864
de190aef
LP
1865 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1866
cbdca852
LP
1867 if (r == 0 && step_back)
1868 goto found;
1869
ecf68b1d 1870 if (r > 0 && idx)
de190aef
LP
1871 (*idx) ++;
1872
1873 return r;
cbdca852
LP
1874
1875found:
1876 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1877 if (r < 0)
1878 return r;
1879
1880 if (ret)
1881 *ret = o;
1882
1883 if (offset)
1884 *offset = extra;
1885
1886 if (idx)
1887 *idx = 0;
1888
1889 return 1;
1890}
1891
44a6b1b6 1892_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1893 assert(f);
1894 assert(p > 0);
1895
1896 if (p == needle)
1897 return TEST_FOUND;
1898 else if (p < needle)
1899 return TEST_LEFT;
1900 else
1901 return TEST_RIGHT;
1902}
1903
de190aef
LP
1904static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1905 Object *o;
1906 int r;
1907
1908 assert(f);
1909 assert(p > 0);
1910
1911 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1912 if (r < 0)
1913 return r;
1914
de190aef
LP
1915 if (le64toh(o->entry.seqnum) == needle)
1916 return TEST_FOUND;
1917 else if (le64toh(o->entry.seqnum) < needle)
1918 return TEST_LEFT;
1919 else
1920 return TEST_RIGHT;
1921}
cec736d2 1922
de190aef
LP
1923int journal_file_move_to_entry_by_seqnum(
1924 JournalFile *f,
1925 uint64_t seqnum,
1926 direction_t direction,
1927 Object **ret,
1928 uint64_t *offset) {
1929
1930 return generic_array_bisect(f,
1931 le64toh(f->header->entry_array_offset),
1932 le64toh(f->header->n_entries),
1933 seqnum,
1934 test_object_seqnum,
1935 direction,
1936 ret, offset, NULL);
1937}
cec736d2 1938
de190aef
LP
1939static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1940 Object *o;
1941 int r;
1942
1943 assert(f);
1944 assert(p > 0);
1945
1946 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1947 if (r < 0)
1948 return r;
1949
1950 if (le64toh(o->entry.realtime) == needle)
1951 return TEST_FOUND;
1952 else if (le64toh(o->entry.realtime) < needle)
1953 return TEST_LEFT;
1954 else
1955 return TEST_RIGHT;
cec736d2
LP
1956}
1957
de190aef
LP
1958int journal_file_move_to_entry_by_realtime(
1959 JournalFile *f,
1960 uint64_t realtime,
1961 direction_t direction,
1962 Object **ret,
1963 uint64_t *offset) {
1964
1965 return generic_array_bisect(f,
1966 le64toh(f->header->entry_array_offset),
1967 le64toh(f->header->n_entries),
1968 realtime,
1969 test_object_realtime,
1970 direction,
1971 ret, offset, NULL);
1972}
1973
1974static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1975 Object *o;
1976 int r;
1977
1978 assert(f);
1979 assert(p > 0);
1980
1981 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1982 if (r < 0)
1983 return r;
1984
1985 if (le64toh(o->entry.monotonic) == needle)
1986 return TEST_FOUND;
1987 else if (le64toh(o->entry.monotonic) < needle)
1988 return TEST_LEFT;
1989 else
1990 return TEST_RIGHT;
1991}
1992
2a560338 1993static int find_data_object_by_boot_id(
47838ab3
ZJS
1994 JournalFile *f,
1995 sd_id128_t boot_id,
1996 Object **o,
1997 uint64_t *b) {
2a560338 1998
47838ab3
ZJS
1999 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2000
2001 sd_id128_to_string(boot_id, t + 9);
2002 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2003}
2004
de190aef
LP
2005int journal_file_move_to_entry_by_monotonic(
2006 JournalFile *f,
2007 sd_id128_t boot_id,
2008 uint64_t monotonic,
2009 direction_t direction,
2010 Object **ret,
2011 uint64_t *offset) {
2012
de190aef
LP
2013 Object *o;
2014 int r;
2015
cbdca852 2016 assert(f);
de190aef 2017
47838ab3 2018 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
2019 if (r < 0)
2020 return r;
cbdca852 2021 if (r == 0)
de190aef
LP
2022 return -ENOENT;
2023
2024 return generic_array_bisect_plus_one(f,
2025 le64toh(o->data.entry_offset),
2026 le64toh(o->data.entry_array_offset),
2027 le64toh(o->data.n_entries),
2028 monotonic,
2029 test_object_monotonic,
2030 direction,
2031 ret, offset, NULL);
2032}
2033
1fc605b0 2034void journal_file_reset_location(JournalFile *f) {
6573ef05 2035 f->location_type = LOCATION_HEAD;
1fc605b0 2036 f->current_offset = 0;
6573ef05
MS
2037 f->current_seqnum = 0;
2038 f->current_realtime = 0;
2039 f->current_monotonic = 0;
2040 zero(f->current_boot_id);
2041 f->current_xor_hash = 0;
2042}
2043
950c07d4 2044void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2045 f->location_type = LOCATION_SEEK;
2046 f->current_offset = offset;
2047 f->current_seqnum = le64toh(o->entry.seqnum);
2048 f->current_realtime = le64toh(o->entry.realtime);
2049 f->current_monotonic = le64toh(o->entry.monotonic);
2050 f->current_boot_id = o->entry.boot_id;
2051 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2052}
2053
d8ae66d7
MS
2054int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2055 assert(af);
2056 assert(bf);
2057 assert(af->location_type == LOCATION_SEEK);
2058 assert(bf->location_type == LOCATION_SEEK);
2059
2060 /* If contents and timestamps match, these entries are
2061 * identical, even if the seqnum does not match */
2062 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2063 af->current_monotonic == bf->current_monotonic &&
2064 af->current_realtime == bf->current_realtime &&
2065 af->current_xor_hash == bf->current_xor_hash)
2066 return 0;
2067
2068 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2069
2070 /* If this is from the same seqnum source, compare
2071 * seqnums */
2072 if (af->current_seqnum < bf->current_seqnum)
2073 return -1;
2074 if (af->current_seqnum > bf->current_seqnum)
2075 return 1;
2076
2077 /* Wow! This is weird, different data but the same
2078 * seqnums? Something is borked, but let's make the
2079 * best of it and compare by time. */
2080 }
2081
2082 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2083
2084 /* If the boot id matches, compare monotonic time */
2085 if (af->current_monotonic < bf->current_monotonic)
2086 return -1;
2087 if (af->current_monotonic > bf->current_monotonic)
2088 return 1;
2089 }
2090
2091 /* Otherwise, compare UTC time */
2092 if (af->current_realtime < bf->current_realtime)
2093 return -1;
2094 if (af->current_realtime > bf->current_realtime)
2095 return 1;
2096
2097 /* Finally, compare by contents */
2098 if (af->current_xor_hash < bf->current_xor_hash)
2099 return -1;
2100 if (af->current_xor_hash > bf->current_xor_hash)
2101 return 1;
2102
2103 return 0;
2104}
2105
de190aef
LP
2106int journal_file_next_entry(
2107 JournalFile *f,
f534928a 2108 uint64_t p,
de190aef
LP
2109 direction_t direction,
2110 Object **ret, uint64_t *offset) {
2111
fb099c8d 2112 uint64_t i, n, ofs;
cec736d2
LP
2113 int r;
2114
2115 assert(f);
de190aef
LP
2116
2117 n = le64toh(f->header->n_entries);
2118 if (n <= 0)
2119 return 0;
cec736d2 2120
f534928a 2121 if (p == 0)
de190aef 2122 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2123 else {
de190aef
LP
2124 r = generic_array_bisect(f,
2125 le64toh(f->header->entry_array_offset),
2126 le64toh(f->header->n_entries),
2127 p,
2128 test_object_offset,
2129 DIRECTION_DOWN,
2130 NULL, NULL,
2131 &i);
2132 if (r <= 0)
2133 return r;
2134
2135 if (direction == DIRECTION_DOWN) {
2136 if (i >= n - 1)
2137 return 0;
2138
2139 i++;
2140 } else {
2141 if (i <= 0)
2142 return 0;
2143
2144 i--;
2145 }
cec736d2
LP
2146 }
2147
de190aef 2148 /* And jump to it */
fb099c8d
ZJS
2149 r = generic_array_get(f,
2150 le64toh(f->header->entry_array_offset),
2151 i,
2152 ret, &ofs);
2153 if (r <= 0)
2154 return r;
2155
2156 if (p > 0 &&
2157 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2158 log_debug("%s: entry array corrupted at entry %"PRIu64,
2159 f->path, i);
2160 return -EBADMSG;
2161 }
2162
2163 if (offset)
2164 *offset = ofs;
2165
2166 return 1;
de190aef 2167}
cec736d2 2168
de190aef
LP
2169int journal_file_next_entry_for_data(
2170 JournalFile *f,
2171 Object *o, uint64_t p,
2172 uint64_t data_offset,
2173 direction_t direction,
2174 Object **ret, uint64_t *offset) {
2175
2176 uint64_t n, i;
cec736d2 2177 int r;
de190aef 2178 Object *d;
cec736d2
LP
2179
2180 assert(f);
de190aef 2181 assert(p > 0 || !o);
cec736d2 2182
de190aef 2183 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2184 if (r < 0)
de190aef 2185 return r;
cec736d2 2186
de190aef
LP
2187 n = le64toh(d->data.n_entries);
2188 if (n <= 0)
2189 return n;
cec736d2 2190
de190aef
LP
2191 if (!o)
2192 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2193 else {
2194 if (o->object.type != OBJECT_ENTRY)
2195 return -EINVAL;
cec736d2 2196
de190aef
LP
2197 r = generic_array_bisect_plus_one(f,
2198 le64toh(d->data.entry_offset),
2199 le64toh(d->data.entry_array_offset),
2200 le64toh(d->data.n_entries),
2201 p,
2202 test_object_offset,
2203 DIRECTION_DOWN,
2204 NULL, NULL,
2205 &i);
2206
2207 if (r <= 0)
cec736d2
LP
2208 return r;
2209
de190aef
LP
2210 if (direction == DIRECTION_DOWN) {
2211 if (i >= n - 1)
2212 return 0;
cec736d2 2213
de190aef
LP
2214 i++;
2215 } else {
2216 if (i <= 0)
2217 return 0;
cec736d2 2218
de190aef
LP
2219 i--;
2220 }
cec736d2 2221
de190aef 2222 }
cec736d2 2223
de190aef
LP
2224 return generic_array_get_plus_one(f,
2225 le64toh(d->data.entry_offset),
2226 le64toh(d->data.entry_array_offset),
2227 i,
2228 ret, offset);
2229}
cec736d2 2230
cbdca852
LP
2231int journal_file_move_to_entry_by_offset_for_data(
2232 JournalFile *f,
2233 uint64_t data_offset,
2234 uint64_t p,
2235 direction_t direction,
2236 Object **ret, uint64_t *offset) {
2237
2238 int r;
2239 Object *d;
2240
2241 assert(f);
2242
2243 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2244 if (r < 0)
2245 return r;
2246
2247 return generic_array_bisect_plus_one(f,
2248 le64toh(d->data.entry_offset),
2249 le64toh(d->data.entry_array_offset),
2250 le64toh(d->data.n_entries),
2251 p,
2252 test_object_offset,
2253 direction,
2254 ret, offset, NULL);
2255}
2256
2257int journal_file_move_to_entry_by_monotonic_for_data(
2258 JournalFile *f,
2259 uint64_t data_offset,
2260 sd_id128_t boot_id,
2261 uint64_t monotonic,
2262 direction_t direction,
2263 Object **ret, uint64_t *offset) {
2264
cbdca852
LP
2265 Object *o, *d;
2266 int r;
2267 uint64_t b, z;
2268
2269 assert(f);
2270
2271 /* First, seek by time */
47838ab3 2272 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2273 if (r < 0)
2274 return r;
2275 if (r == 0)
2276 return -ENOENT;
2277
2278 r = generic_array_bisect_plus_one(f,
2279 le64toh(o->data.entry_offset),
2280 le64toh(o->data.entry_array_offset),
2281 le64toh(o->data.n_entries),
2282 monotonic,
2283 test_object_monotonic,
2284 direction,
2285 NULL, &z, NULL);
2286 if (r <= 0)
2287 return r;
2288
2289 /* And now, continue seeking until we find an entry that
2290 * exists in both bisection arrays */
2291
2292 for (;;) {
2293 Object *qo;
2294 uint64_t p, q;
2295
2296 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2297 if (r < 0)
2298 return r;
2299
2300 r = generic_array_bisect_plus_one(f,
2301 le64toh(d->data.entry_offset),
2302 le64toh(d->data.entry_array_offset),
2303 le64toh(d->data.n_entries),
2304 z,
2305 test_object_offset,
2306 direction,
2307 NULL, &p, NULL);
2308 if (r <= 0)
2309 return r;
2310
2311 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2312 if (r < 0)
2313 return r;
2314
2315 r = generic_array_bisect_plus_one(f,
2316 le64toh(o->data.entry_offset),
2317 le64toh(o->data.entry_array_offset),
2318 le64toh(o->data.n_entries),
2319 p,
2320 test_object_offset,
2321 direction,
2322 &qo, &q, NULL);
2323
2324 if (r <= 0)
2325 return r;
2326
2327 if (p == q) {
2328 if (ret)
2329 *ret = qo;
2330 if (offset)
2331 *offset = q;
2332
2333 return 1;
2334 }
2335
2336 z = q;
2337 }
cbdca852
LP
2338}
2339
de190aef
LP
2340int journal_file_move_to_entry_by_seqnum_for_data(
2341 JournalFile *f,
2342 uint64_t data_offset,
2343 uint64_t seqnum,
2344 direction_t direction,
2345 Object **ret, uint64_t *offset) {
cec736d2 2346
de190aef
LP
2347 Object *d;
2348 int r;
cec736d2 2349
91a31dde
LP
2350 assert(f);
2351
de190aef 2352 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2353 if (r < 0)
de190aef 2354 return r;
cec736d2 2355
de190aef
LP
2356 return generic_array_bisect_plus_one(f,
2357 le64toh(d->data.entry_offset),
2358 le64toh(d->data.entry_array_offset),
2359 le64toh(d->data.n_entries),
2360 seqnum,
2361 test_object_seqnum,
2362 direction,
2363 ret, offset, NULL);
2364}
cec736d2 2365
de190aef
LP
2366int journal_file_move_to_entry_by_realtime_for_data(
2367 JournalFile *f,
2368 uint64_t data_offset,
2369 uint64_t realtime,
2370 direction_t direction,
2371 Object **ret, uint64_t *offset) {
2372
2373 Object *d;
2374 int r;
2375
91a31dde
LP
2376 assert(f);
2377
de190aef 2378 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2379 if (r < 0)
de190aef
LP
2380 return r;
2381
2382 return generic_array_bisect_plus_one(f,
2383 le64toh(d->data.entry_offset),
2384 le64toh(d->data.entry_array_offset),
2385 le64toh(d->data.n_entries),
2386 realtime,
2387 test_object_realtime,
2388 direction,
2389 ret, offset, NULL);
cec736d2
LP
2390}
2391
0284adc6 2392void journal_file_dump(JournalFile *f) {
7560fffc 2393 Object *o;
7560fffc 2394 int r;
0284adc6 2395 uint64_t p;
7560fffc
LP
2396
2397 assert(f);
2398
0284adc6 2399 journal_file_print_header(f);
7560fffc 2400
0284adc6
LP
2401 p = le64toh(f->header->header_size);
2402 while (p != 0) {
d05089d8 2403 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
2404 if (r < 0)
2405 goto fail;
7560fffc 2406
0284adc6 2407 switch (o->object.type) {
d98cc1f2 2408
0284adc6
LP
2409 case OBJECT_UNUSED:
2410 printf("Type: OBJECT_UNUSED\n");
2411 break;
d98cc1f2 2412
0284adc6
LP
2413 case OBJECT_DATA:
2414 printf("Type: OBJECT_DATA\n");
2415 break;
7560fffc 2416
3c1668da
LP
2417 case OBJECT_FIELD:
2418 printf("Type: OBJECT_FIELD\n");
2419 break;
2420
0284adc6 2421 case OBJECT_ENTRY:
507f22bd
ZJS
2422 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2423 le64toh(o->entry.seqnum),
2424 le64toh(o->entry.monotonic),
2425 le64toh(o->entry.realtime));
0284adc6 2426 break;
7560fffc 2427
0284adc6
LP
2428 case OBJECT_FIELD_HASH_TABLE:
2429 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2430 break;
7560fffc 2431
0284adc6
LP
2432 case OBJECT_DATA_HASH_TABLE:
2433 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2434 break;
7560fffc 2435
0284adc6
LP
2436 case OBJECT_ENTRY_ARRAY:
2437 printf("Type: OBJECT_ENTRY_ARRAY\n");
2438 break;
7560fffc 2439
0284adc6 2440 case OBJECT_TAG:
507f22bd
ZJS
2441 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2442 le64toh(o->tag.seqnum),
2443 le64toh(o->tag.epoch));
0284adc6 2444 break;
3c1668da
LP
2445
2446 default:
8facc349 2447 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 2448 break;
0284adc6 2449 }
7560fffc 2450
d89c8fdf
ZJS
2451 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2452 printf("Flags: %s\n",
2453 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 2454
0284adc6
LP
2455 if (p == le64toh(f->header->tail_object_offset))
2456 p = 0;
2457 else
2458 p = p + ALIGN64(le64toh(o->object.size));
2459 }
7560fffc 2460
0284adc6
LP
2461 return;
2462fail:
2463 log_error("File corrupt");
7560fffc
LP
2464}
2465
718fe4b1
ZJS
2466static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2467 const char *x;
2468
2469 x = format_timestamp(buf, l, t);
2470 if (x)
2471 return x;
2472 return " --- ";
2473}
2474
0284adc6 2475void journal_file_print_header(JournalFile *f) {
2765b7bb 2476 char a[33], b[33], c[33], d[33];
ed375beb 2477 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2478 struct stat st;
2479 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2480
2481 assert(f);
7560fffc 2482
0284adc6
LP
2483 printf("File Path: %s\n"
2484 "File ID: %s\n"
2485 "Machine ID: %s\n"
2486 "Boot ID: %s\n"
2487 "Sequential Number ID: %s\n"
2488 "State: %s\n"
2489 "Compatible Flags:%s%s\n"
d89c8fdf 2490 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
2491 "Header size: %"PRIu64"\n"
2492 "Arena size: %"PRIu64"\n"
2493 "Data Hash Table Size: %"PRIu64"\n"
2494 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2495 "Rotate Suggested: %s\n"
507f22bd
ZJS
2496 "Head Sequential Number: %"PRIu64"\n"
2497 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2498 "Head Realtime Timestamp: %s\n"
3223f44f 2499 "Tail Realtime Timestamp: %s\n"
ed375beb 2500 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2501 "Objects: %"PRIu64"\n"
2502 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2503 f->path,
2504 sd_id128_to_string(f->header->file_id, a),
2505 sd_id128_to_string(f->header->machine_id, b),
2506 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2507 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2508 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2509 f->header->state == STATE_ONLINE ? "ONLINE" :
2510 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 2511 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
2512 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2513 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2514 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2515 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
2516 le64toh(f->header->header_size),
2517 le64toh(f->header->arena_size),
2518 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2519 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2520 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2521 le64toh(f->header->head_entry_seqnum),
2522 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2523 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2524 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2525 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2526 le64toh(f->header->n_objects),
2527 le64toh(f->header->n_entries));
7560fffc 2528
0284adc6 2529 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2530 printf("Data Objects: %"PRIu64"\n"
0284adc6 2531 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2532 le64toh(f->header->n_data),
0284adc6 2533 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2534
0284adc6 2535 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2536 printf("Field Objects: %"PRIu64"\n"
0284adc6 2537 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2538 le64toh(f->header->n_fields),
0284adc6 2539 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2540
2541 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2542 printf("Tag Objects: %"PRIu64"\n",
2543 le64toh(f->header->n_tags));
3223f44f 2544 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2545 printf("Entry Array Objects: %"PRIu64"\n",
2546 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2547
2548 if (fstat(f->fd, &st) >= 0)
59f448cf 2549 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
7560fffc
LP
2550}
2551
fc68c929
LP
2552static int journal_file_warn_btrfs(JournalFile *f) {
2553 unsigned attrs;
2554 int r;
2555
2556 assert(f);
2557
2558 /* Before we write anything, check if the COW logic is turned
2559 * off on btrfs. Given our write pattern that is quite
2560 * unfriendly to COW file systems this should greatly improve
2561 * performance on COW file systems, such as btrfs, at the
2562 * expense of data integrity features (which shouldn't be too
2563 * bad, given that we do our own checksumming). */
2564
2565 r = btrfs_is_filesystem(f->fd);
2566 if (r < 0)
2567 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2568 if (!r)
2569 return 0;
2570
2571 r = read_attr_fd(f->fd, &attrs);
2572 if (r < 0)
2573 return log_warning_errno(r, "Failed to read file attributes: %m");
2574
2575 if (attrs & FS_NOCOW_FL) {
2576 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2577 return 0;
2578 }
2579
2580 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2581 "This is likely to slow down journal access substantially, please consider turning "
2582 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2583
2584 return 1;
2585}
2586
0284adc6
LP
2587int journal_file_open(
2588 const char *fname,
2589 int flags,
2590 mode_t mode,
2591 bool compress,
baed47c3 2592 bool seal,
0284adc6
LP
2593 JournalMetrics *metrics,
2594 MMapCache *mmap_cache,
2595 JournalFile *template,
2596 JournalFile **ret) {
7560fffc 2597
fa6ac760 2598 bool newly_created = false;
0284adc6 2599 JournalFile *f;
fa6ac760 2600 void *h;
0284adc6 2601 int r;
7560fffc 2602
0284adc6 2603 assert(fname);
0559d3a5 2604 assert(ret);
7560fffc 2605
0284adc6
LP
2606 if ((flags & O_ACCMODE) != O_RDONLY &&
2607 (flags & O_ACCMODE) != O_RDWR)
2608 return -EINVAL;
7560fffc 2609
a0108012
LP
2610 if (!endswith(fname, ".journal") &&
2611 !endswith(fname, ".journal~"))
0284adc6 2612 return -EINVAL;
7560fffc 2613
0284adc6
LP
2614 f = new0(JournalFile, 1);
2615 if (!f)
2616 return -ENOMEM;
7560fffc 2617
0284adc6
LP
2618 f->fd = -1;
2619 f->mode = mode;
7560fffc 2620
0284adc6
LP
2621 f->flags = flags;
2622 f->prot = prot_from_flags(flags);
2623 f->writable = (flags & O_ACCMODE) != O_RDONLY;
92261977 2624#if defined(HAVE_LZ4)
d89c8fdf
ZJS
2625 f->compress_lz4 = compress;
2626#elif defined(HAVE_XZ)
2627 f->compress_xz = compress;
48b61739 2628#endif
49a32d43 2629#ifdef HAVE_GCRYPT
baed47c3 2630 f->seal = seal;
49a32d43 2631#endif
7560fffc 2632
0284adc6
LP
2633 if (mmap_cache)
2634 f->mmap = mmap_cache_ref(mmap_cache);
2635 else {
84168d80 2636 f->mmap = mmap_cache_new();
0284adc6
LP
2637 if (!f->mmap) {
2638 r = -ENOMEM;
2639 goto fail;
2640 }
2641 }
7560fffc 2642
0284adc6
LP
2643 f->path = strdup(fname);
2644 if (!f->path) {
2645 r = -ENOMEM;
2646 goto fail;
2647 }
7560fffc 2648
4743015d 2649 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
2650 if (!f->chain_cache) {
2651 r = -ENOMEM;
2652 goto fail;
2653 }
2654
0284adc6
LP
2655 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2656 if (f->fd < 0) {
2657 r = -errno;
2658 goto fail;
7560fffc 2659 }
7560fffc 2660
2678031a
LP
2661 r = journal_file_fstat(f);
2662 if (r < 0)
0284adc6 2663 goto fail;
7560fffc 2664
0284adc6 2665 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a 2666
fc68c929 2667 (void) journal_file_warn_btrfs(f);
11689d2a 2668
fb0951b0
LP
2669 /* Let's attach the creation time to the journal file,
2670 * so that the vacuuming code knows the age of this
2671 * file even if the file might end up corrupted one
2672 * day... Ideally we'd just use the creation time many
2673 * file systems maintain for each file, but there is
2674 * currently no usable API to query this, hence let's
2675 * emulate this via extended attributes. If extended
2676 * attributes are not supported we'll just skip this,
7517e174 2677 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0 2678
d61b600d 2679 fd_setcrtime(f->fd, 0);
7560fffc 2680
feb12d3e 2681#ifdef HAVE_GCRYPT
0284adc6 2682 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2683 * just don't do sealing */
49a32d43
LP
2684 if (f->seal) {
2685 r = journal_file_fss_load(f);
2686 if (r < 0)
2687 f->seal = false;
2688 }
feb12d3e 2689#endif
7560fffc 2690
0284adc6
LP
2691 r = journal_file_init_header(f, template);
2692 if (r < 0)
2693 goto fail;
7560fffc 2694
2678031a
LP
2695 r = journal_file_fstat(f);
2696 if (r < 0)
0284adc6 2697 goto fail;
fb0951b0
LP
2698
2699 newly_created = true;
0284adc6 2700 }
7560fffc 2701
0284adc6
LP
2702 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2703 r = -EIO;
2704 goto fail;
2705 }
7560fffc 2706
fa6ac760 2707 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
977eaa1e 2708 if (r < 0)
0284adc6 2709 goto fail;
7560fffc 2710
fa6ac760
LP
2711 f->header = h;
2712
0284adc6
LP
2713 if (!newly_created) {
2714 r = journal_file_verify_header(f);
2715 if (r < 0)
2716 goto fail;
2717 }
7560fffc 2718
feb12d3e 2719#ifdef HAVE_GCRYPT
0284adc6 2720 if (!newly_created && f->writable) {
baed47c3 2721 r = journal_file_fss_load(f);
0284adc6
LP
2722 if (r < 0)
2723 goto fail;
2724 }
feb12d3e 2725#endif
cec736d2
LP
2726
2727 if (f->writable) {
4a92baf3
LP
2728 if (metrics) {
2729 journal_default_metrics(metrics, f->fd);
2730 f->metrics = *metrics;
2731 } else if (template)
2732 f->metrics = template->metrics;
2733
cec736d2
LP
2734 r = journal_file_refresh_header(f);
2735 if (r < 0)
2736 goto fail;
2737 }
2738
feb12d3e 2739#ifdef HAVE_GCRYPT
baed47c3 2740 r = journal_file_hmac_setup(f);
14d10188
LP
2741 if (r < 0)
2742 goto fail;
feb12d3e 2743#endif
14d10188 2744
cec736d2 2745 if (newly_created) {
de190aef 2746 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2747 if (r < 0)
2748 goto fail;
2749
de190aef 2750 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2751 if (r < 0)
2752 goto fail;
7560fffc 2753
feb12d3e 2754#ifdef HAVE_GCRYPT
7560fffc
LP
2755 r = journal_file_append_first_tag(f);
2756 if (r < 0)
2757 goto fail;
feb12d3e 2758#endif
cec736d2
LP
2759 }
2760
fa6ac760
LP
2761 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2762 r = -EIO;
2763 goto fail;
2764 }
2765
0559d3a5 2766 *ret = f;
cec736d2
LP
2767 return 0;
2768
2769fail:
fa6ac760
LP
2770 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2771 r = -EIO;
2772
cec736d2
LP
2773 journal_file_close(f);
2774
2775 return r;
2776}
0ac38b70 2777
baed47c3 2778int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2779 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2780 size_t l;
2781 JournalFile *old_file, *new_file = NULL;
2782 int r;
2783
2784 assert(f);
2785 assert(*f);
2786
2787 old_file = *f;
2788
2789 if (!old_file->writable)
2790 return -EINVAL;
2791
2792 if (!endswith(old_file->path, ".journal"))
2793 return -EINVAL;
2794
2795 l = strlen(old_file->path);
57535f47
ZJS
2796 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2797 (int) l - 8, old_file->path,
2798 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2799 le64toh((*f)->header->head_entry_seqnum),
2800 le64toh((*f)->header->head_entry_realtime));
2801 if (r < 0)
0ac38b70
LP
2802 return -ENOMEM;
2803
2678031a
LP
2804 /* Try to rename the file to the archived version. If the file
2805 * already was deleted, we'll get ENOENT, let's ignore that
2806 * case. */
0ac38b70 2807 r = rename(old_file->path, p);
2678031a 2808 if (r < 0 && errno != ENOENT)
0ac38b70
LP
2809 return -errno;
2810
ccdbaf91 2811 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2812
f27a3864
LP
2813 /* Currently, btrfs is not very good with out write patterns
2814 * and fragments heavily. Let's defrag our journal files when
2815 * we archive them */
2816 old_file->defrag_on_close = true;
2817
baed47c3 2818 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2819 journal_file_close(old_file);
2820
2821 *f = new_file;
2822 return r;
2823}
2824
9447a7f1
LP
2825int journal_file_open_reliably(
2826 const char *fname,
2827 int flags,
2828 mode_t mode,
7560fffc 2829 bool compress,
baed47c3 2830 bool seal,
4a92baf3 2831 JournalMetrics *metrics,
27370278 2832 MMapCache *mmap_cache,
9447a7f1
LP
2833 JournalFile *template,
2834 JournalFile **ret) {
2835
2836 int r;
2837 size_t l;
ed375beb 2838 _cleanup_free_ char *p = NULL;
9447a7f1 2839
070052ab 2840 r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
288359db
ZJS
2841 if (!IN_SET(r,
2842 -EBADMSG, /* corrupted */
2843 -ENODATA, /* truncated */
2844 -EHOSTDOWN, /* other machine */
2845 -EPROTONOSUPPORT, /* incompatible feature */
2846 -EBUSY, /* unclean shutdown */
2847 -ESHUTDOWN, /* already archived */
2848 -EIO, /* IO error, including SIGBUS on mmap */
2849 -EIDRM /* File has been deleted */))
9447a7f1
LP
2850 return r;
2851
2852 if ((flags & O_ACCMODE) == O_RDONLY)
2853 return r;
2854
2855 if (!(flags & O_CREAT))
2856 return r;
2857
7560fffc
LP
2858 if (!endswith(fname, ".journal"))
2859 return r;
2860
5c70eab4
LP
2861 /* The file is corrupted. Rotate it away and try it again (but only once) */
2862
9447a7f1 2863 l = strlen(fname);
d587eca5 2864 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
57535f47 2865 (int) l - 8, fname,
d587eca5 2866 now(CLOCK_REALTIME),
9bf3b535 2867 random_u64()) < 0)
9447a7f1
LP
2868 return -ENOMEM;
2869
65089b82 2870 if (rename(fname, p) < 0)
9447a7f1
LP
2871 return -errno;
2872
f27a3864
LP
2873 /* btrfs doesn't cope well with our write pattern and
2874 * fragments heavily. Let's defrag all files we rotate */
11689d2a
LP
2875
2876 (void) chattr_path(p, false, FS_NOCOW_FL);
f27a3864
LP
2877 (void) btrfs_defrag(p);
2878
65089b82 2879 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2880
070052ab 2881 return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
9447a7f1
LP
2882}
2883
cf244689
LP
2884int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2885 uint64_t i, n;
2886 uint64_t q, xor_hash = 0;
2887 int r;
2888 EntryItem *items;
2889 dual_timestamp ts;
2890
2891 assert(from);
2892 assert(to);
2893 assert(o);
2894 assert(p);
2895
2896 if (!to->writable)
2897 return -EPERM;
2898
2899 ts.monotonic = le64toh(o->entry.monotonic);
2900 ts.realtime = le64toh(o->entry.realtime);
2901
cf244689 2902 n = journal_file_entry_n_items(o);
4faa7004
TA
2903 /* alloca() can't take 0, hence let's allocate at least one */
2904 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
2905
2906 for (i = 0; i < n; i++) {
4fd052ae
FC
2907 uint64_t l, h;
2908 le64_t le_hash;
cf244689
LP
2909 size_t t;
2910 void *data;
2911 Object *u;
2912
2913 q = le64toh(o->entry.items[i].object_offset);
2914 le_hash = o->entry.items[i].hash;
2915
2916 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2917 if (r < 0)
2918 return r;
2919
2920 if (le_hash != o->data.hash)
2921 return -EBADMSG;
2922
2923 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2924 t = (size_t) l;
2925
2926 /* We hit the limit on 32bit machines */
2927 if ((uint64_t) t != l)
2928 return -E2BIG;
2929
d89c8fdf 2930 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 2931#if defined(HAVE_XZ) || defined(HAVE_LZ4)
a7f7d1bd 2932 size_t rsize = 0;
cf244689 2933
d89c8fdf
ZJS
2934 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2935 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2936 if (r < 0)
2937 return r;
cf244689
LP
2938
2939 data = from->compress_buffer;
2940 l = rsize;
3b1a55e1
ZJS
2941#else
2942 return -EPROTONOSUPPORT;
2943#endif
cf244689
LP
2944 } else
2945 data = o->data.payload;
2946
2947 r = journal_file_append_data(to, data, l, &u, &h);
2948 if (r < 0)
2949 return r;
2950
2951 xor_hash ^= le64toh(u->data.hash);
2952 items[i].object_offset = htole64(h);
2953 items[i].hash = u->data.hash;
2954
2955 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2956 if (r < 0)
2957 return r;
2958 }
2959
fa6ac760
LP
2960 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2961
2962 if (mmap_cache_got_sigbus(to->mmap, to->fd))
2963 return -EIO;
2964
2965 return r;
cf244689 2966}
babfc091 2967
8580d1f7
LP
2968void journal_reset_metrics(JournalMetrics *m) {
2969 assert(m);
2970
2971 /* Set everything to "pick automatic values". */
2972
2973 *m = (JournalMetrics) {
2974 .min_use = (uint64_t) -1,
2975 .max_use = (uint64_t) -1,
2976 .min_size = (uint64_t) -1,
2977 .max_size = (uint64_t) -1,
2978 .keep_free = (uint64_t) -1,
2979 .n_max_files = (uint64_t) -1,
2980 };
2981}
2982
babfc091 2983void journal_default_metrics(JournalMetrics *m, int fd) {
8580d1f7 2984 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
babfc091 2985 struct statvfs ss;
8580d1f7 2986 uint64_t fs_size;
babfc091
LP
2987
2988 assert(m);
2989 assert(fd >= 0);
2990
2991 if (fstatvfs(fd, &ss) >= 0)
2992 fs_size = ss.f_frsize * ss.f_blocks;
8580d1f7
LP
2993 else {
2994 log_debug_errno(errno, "Failed to detremine disk size: %m");
2995 fs_size = 0;
2996 }
babfc091
LP
2997
2998 if (m->max_use == (uint64_t) -1) {
2999
3000 if (fs_size > 0) {
3001 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3002
3003 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3004 m->max_use = DEFAULT_MAX_USE_UPPER;
3005
3006 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3007 m->max_use = DEFAULT_MAX_USE_LOWER;
3008 } else
3009 m->max_use = DEFAULT_MAX_USE_LOWER;
3010 } else {
3011 m->max_use = PAGE_ALIGN(m->max_use);
3012
8580d1f7 3013 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
babfc091
LP
3014 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3015 }
3016
8580d1f7
LP
3017 if (m->min_use == (uint64_t) -1)
3018 m->min_use = DEFAULT_MIN_USE;
3019
3020 if (m->min_use > m->max_use)
3021 m->min_use = m->max_use;
3022
babfc091
LP
3023 if (m->max_size == (uint64_t) -1) {
3024 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3025
3026 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3027 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3028 } else
3029 m->max_size = PAGE_ALIGN(m->max_size);
3030
8580d1f7
LP
3031 if (m->max_size != 0) {
3032 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3033 m->max_size = JOURNAL_FILE_SIZE_MIN;
babfc091 3034
8580d1f7
LP
3035 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3036 m->max_use = m->max_size*2;
3037 }
babfc091
LP
3038
3039 if (m->min_size == (uint64_t) -1)
3040 m->min_size = JOURNAL_FILE_SIZE_MIN;
3041 else {
3042 m->min_size = PAGE_ALIGN(m->min_size);
3043
3044 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3045 m->min_size = JOURNAL_FILE_SIZE_MIN;
3046
8580d1f7 3047 if (m->max_size != 0 && m->min_size > m->max_size)
babfc091
LP
3048 m->max_size = m->min_size;
3049 }
3050
3051 if (m->keep_free == (uint64_t) -1) {
3052
3053 if (fs_size > 0) {
8621b110 3054 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
3055
3056 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3057 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3058
3059 } else
3060 m->keep_free = DEFAULT_KEEP_FREE;
3061 }
3062
8580d1f7
LP
3063 if (m->n_max_files == (uint64_t) -1)
3064 m->n_max_files = DEFAULT_N_MAX_FILES;
3065
3066 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3067 format_bytes(a, sizeof(a), m->min_use),
3068 format_bytes(b, sizeof(b), m->max_use),
3069 format_bytes(c, sizeof(c), m->max_size),
3070 format_bytes(d, sizeof(d), m->min_size),
3071 format_bytes(e, sizeof(e), m->keep_free),
3072 m->n_max_files);
babfc091 3073}
08984293
LP
3074
3075int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
3076 assert(f);
3077 assert(from || to);
3078
3079 if (from) {
162566a4
LP
3080 if (f->header->head_entry_realtime == 0)
3081 return -ENOENT;
08984293 3082
162566a4 3083 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
3084 }
3085
3086 if (to) {
162566a4
LP
3087 if (f->header->tail_entry_realtime == 0)
3088 return -ENOENT;
08984293 3089
162566a4 3090 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
3091 }
3092
3093 return 1;
3094}
3095
3096int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
3097 Object *o;
3098 uint64_t p;
3099 int r;
3100
3101 assert(f);
3102 assert(from || to);
3103
47838ab3 3104 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
3105 if (r <= 0)
3106 return r;
3107
3108 if (le64toh(o->data.n_entries) <= 0)
3109 return 0;
3110
3111 if (from) {
3112 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3113 if (r < 0)
3114 return r;
3115
3116 *from = le64toh(o->entry.monotonic);
3117 }
3118
3119 if (to) {
3120 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3121 if (r < 0)
3122 return r;
3123
3124 r = generic_array_get_plus_one(f,
3125 le64toh(o->data.entry_offset),
3126 le64toh(o->data.entry_array_offset),
3127 le64toh(o->data.n_entries)-1,
3128 &o, NULL);
3129 if (r <= 0)
3130 return r;
3131
3132 *to = le64toh(o->entry.monotonic);
3133 }
3134
3135 return 1;
3136}
dca6219e 3137
fb0951b0 3138bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
3139 assert(f);
3140
3141 /* If we gained new header fields we gained new features,
3142 * hence suggest a rotation */
361f9cbc
LP
3143 if (le64toh(f->header->header_size) < sizeof(Header)) {
3144 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3145 return true;
361f9cbc 3146 }
dca6219e
LP
3147
3148 /* Let's check if the hash tables grew over a certain fill
3149 * level (75%, borrowing this value from Java's hash table
3150 * implementation), and if so suggest a rotation. To calculate
3151 * the fill level we need the n_data field, which only exists
3152 * in newer versions. */
3153
3154 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3155 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3156 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3157 f->path,
3158 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3159 le64toh(f->header->n_data),
3160 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3161 (unsigned long long) f->last_stat.st_size,
3162 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3163 return true;
361f9cbc 3164 }
dca6219e
LP
3165
3166 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3167 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3168 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3169 f->path,
3170 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3171 le64toh(f->header->n_fields),
3172 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3173 return true;
361f9cbc 3174 }
dca6219e 3175
0598fd4a
LP
3176 /* Are the data objects properly indexed by field objects? */
3177 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3178 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3179 le64toh(f->header->n_data) > 0 &&
3180 le64toh(f->header->n_fields) == 0)
3181 return true;
3182
fb0951b0
LP
3183 if (max_file_usec > 0) {
3184 usec_t t, h;
3185
3186 h = le64toh(f->header->head_entry_realtime);
3187 t = now(CLOCK_REALTIME);
3188
3189 if (h > 0 && t > h + max_file_usec)
3190 return true;
3191 }
3192
dca6219e
LP
3193 return false;
3194}