]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
journal: use automatic clenup for ACL types
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <sys/mman.h>
23#include <errno.h>
24#include <sys/uio.h>
25#include <unistd.h>
26#include <sys/statvfs.h>
27#include <fcntl.h>
28#include <stddef.h>
11689d2a 29#include <linux/fs.h>
fb0951b0 30
f27a3864 31#include "btrfs-util.h"
cec736d2
LP
32#include "journal-def.h"
33#include "journal-file.h"
0284adc6 34#include "journal-authenticate.h"
cec736d2 35#include "lookup3.h"
807e17f0 36#include "compress.h"
3df3e884 37#include "random-util.h"
cec736d2 38
4a92baf3
LP
39#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
40#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 41
be19b7df 42#define COMPRESSION_SIZE_THRESHOLD (512ULL)
807e17f0 43
babfc091 44/* This is the minimum journal file size */
253f59df 45#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
babfc091
LP
46
47/* These are the lower and upper bounds if we deduce the max_use value
48 * from the file system size */
49#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
50#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
51
52/* This is the upper bound if we deduce max_size from max_use */
71100051 53#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
babfc091
LP
54
55/* This is the upper bound if we deduce the keep_free value from the
56 * file system size */
57#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
58
59/* This is the keep_free value when we can't determine the system
60 * size */
61#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
62
dca6219e
LP
63/* n_data was the first entry we added after the initial file format design */
64#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 65
a4bcff5b
LP
66/* How many entries to keep in the entry array chain cache at max */
67#define CHAIN_CACHE_MAX 20
68
a676e665
LP
69/* How much to increase the journal file size at once each time we allocate something new. */
70#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
71
2678031a
LP
72/* Reread fstat() of the file for detecting deletions at least this often */
73#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
74
fa6ac760
LP
75/* The mmap context to use for the header; we pick the one right above the last defined object type */
76#define CONTEXT_HEADER _OBJECT_TYPE_MAX
77
9588bc32 78static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
79 assert(f);
80
81 if (!f->writable)
82 return -EPERM;
83
84 if (!(f->fd >= 0 && f->header))
85 return -EINVAL;
86
fa6ac760
LP
87 if (mmap_cache_got_sigbus(f->mmap, f->fd))
88 return -EIO;
89
26687bf8
OS
90 switch(f->header->state) {
91 case STATE_ONLINE:
92 return 0;
93
94 case STATE_OFFLINE:
95 f->header->state = STATE_ONLINE;
96 fsync(f->fd);
97 return 0;
98
99 default:
100 return -EINVAL;
101 }
102}
103
104int journal_file_set_offline(JournalFile *f) {
105 assert(f);
106
107 if (!f->writable)
108 return -EPERM;
109
110 if (!(f->fd >= 0 && f->header))
111 return -EINVAL;
112
113 if (f->header->state != STATE_ONLINE)
114 return 0;
115
116 fsync(f->fd);
117
fa6ac760
LP
118 if (mmap_cache_got_sigbus(f->mmap, f->fd))
119 return -EIO;
120
26687bf8
OS
121 f->header->state = STATE_OFFLINE;
122
fa6ac760
LP
123 if (mmap_cache_got_sigbus(f->mmap, f->fd))
124 return -EIO;
125
26687bf8
OS
126 fsync(f->fd);
127
128 return 0;
129}
130
804ae586 131JournalFile* journal_file_close(JournalFile *f) {
de190aef 132 assert(f);
cec736d2 133
feb12d3e 134#ifdef HAVE_GCRYPT
b0af6f41 135 /* Write the final tag */
c586dbf1 136 if (f->seal && f->writable)
b0af6f41 137 journal_file_append_tag(f);
feb12d3e 138#endif
b0af6f41 139
26687bf8 140 journal_file_set_offline(f);
cec736d2 141
fa6ac760
LP
142 if (f->mmap && f->fd >= 0)
143 mmap_cache_close_fd(f->mmap, f->fd);
cec736d2 144
11689d2a
LP
145 if (f->fd >= 0 && f->defrag_on_close) {
146
147 /* Be friendly to btrfs: turn COW back on again now,
148 * and defragment the file. We won't write to the file
149 * ever again, hence remove all fragmentation, and
150 * reenable all the good bits COW usually provides
151 * (such as data checksumming). */
152
1ed8f8c1 153 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
11689d2a
LP
154 (void) btrfs_defrag_fd(f->fd);
155 }
f27a3864 156
03e334a1 157 safe_close(f->fd);
cec736d2 158 free(f->path);
807e17f0 159
16e9f408
LP
160 if (f->mmap)
161 mmap_cache_unref(f->mmap);
162
4743015d 163 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 164
d89c8fdf 165#if defined(HAVE_XZ) || defined(HAVE_LZ4)
807e17f0
LP
166 free(f->compress_buffer);
167#endif
168
7560fffc 169#ifdef HAVE_GCRYPT
baed47c3
LP
170 if (f->fss_file)
171 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
dc4ebc07 172 else
b7c9ae91
LP
173 free(f->fsprg_state);
174
175 free(f->fsprg_seed);
7560fffc
LP
176
177 if (f->hmac)
178 gcry_md_close(f->hmac);
179#endif
180
cec736d2 181 free(f);
804ae586 182 return NULL;
cec736d2
LP
183}
184
0ac38b70 185static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 186 Header h = {};
cec736d2
LP
187 ssize_t k;
188 int r;
189
190 assert(f);
191
7560fffc 192 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 193 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 194
d89c8fdf
ZJS
195 h.incompatible_flags |= htole32(
196 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
197 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 198
d89c8fdf
ZJS
199 h.compatible_flags = htole32(
200 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 201
cec736d2
LP
202 r = sd_id128_randomize(&h.file_id);
203 if (r < 0)
204 return r;
205
0ac38b70
LP
206 if (template) {
207 h.seqnum_id = template->header->seqnum_id;
beec0085 208 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
209 } else
210 h.seqnum_id = h.file_id;
cec736d2
LP
211
212 k = pwrite(f->fd, &h, sizeof(h), 0);
213 if (k < 0)
214 return -errno;
215
216 if (k != sizeof(h))
217 return -EIO;
218
219 return 0;
220}
221
222static int journal_file_refresh_header(JournalFile *f) {
de190aef 223 sd_id128_t boot_id;
fa6ac760 224 int r;
cec736d2
LP
225
226 assert(f);
227
228 r = sd_id128_get_machine(&f->header->machine_id);
229 if (r < 0)
230 return r;
231
de190aef 232 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
233 if (r < 0)
234 return r;
235
de190aef
LP
236 if (sd_id128_equal(boot_id, f->header->boot_id))
237 f->tail_entry_monotonic_valid = true;
238
239 f->header->boot_id = boot_id;
240
fa6ac760 241 r = journal_file_set_online(f);
b788cc23 242
7560fffc 243 /* Sync the online state to disk */
a676e665 244 fsync(f->fd);
b788cc23 245
fa6ac760 246 return r;
cec736d2
LP
247}
248
249static int journal_file_verify_header(JournalFile *f) {
d89c8fdf
ZJS
250 uint32_t flags;
251
cec736d2
LP
252 assert(f);
253
7560fffc 254 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
255 return -EBADMSG;
256
7560fffc
LP
257 /* In both read and write mode we refuse to open files with
258 * incompatible flags we don't know */
d89c8fdf
ZJS
259 flags = le32toh(f->header->incompatible_flags);
260 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
261 if (flags & ~HEADER_INCOMPATIBLE_ANY)
262 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
263 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
264 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
265 if (flags)
266 log_debug("Journal file %s uses incompatible flags %"PRIx32
267 " disabled at compilation time.", f->path, flags);
cec736d2 268 return -EPROTONOSUPPORT;
d89c8fdf 269 }
cec736d2 270
7560fffc
LP
271 /* When open for writing we refuse to open files with
272 * compatible flags, too */
d89c8fdf
ZJS
273 flags = le32toh(f->header->compatible_flags);
274 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
275 if (flags & ~HEADER_COMPATIBLE_ANY)
276 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
277 f->path, flags & ~HEADER_COMPATIBLE_ANY);
278 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
279 if (flags)
280 log_debug("Journal file %s uses compatible flags %"PRIx32
281 " disabled at compilation time.", f->path, flags);
282 return -EPROTONOSUPPORT;
7560fffc
LP
283 }
284
db11ac1a
LP
285 if (f->header->state >= _STATE_MAX)
286 return -EBADMSG;
287
dca6219e
LP
288 /* The first addition was n_data, so check that we are at least this large */
289 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
290 return -EBADMSG;
291
8088cbd3 292 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
293 return -EBADMSG;
294
db11ac1a
LP
295 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
296 return -ENODATA;
297
298 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
299 return -ENODATA;
300
7762e02b
LP
301 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
302 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
303 !VALID64(le64toh(f->header->tail_object_offset)) ||
304 !VALID64(le64toh(f->header->entry_array_offset)))
305 return -ENODATA;
306
cec736d2 307 if (f->writable) {
ccdbaf91 308 uint8_t state;
cec736d2
LP
309 sd_id128_t machine_id;
310 int r;
311
312 r = sd_id128_get_machine(&machine_id);
313 if (r < 0)
314 return r;
315
316 if (!sd_id128_equal(machine_id, f->header->machine_id))
317 return -EHOSTDOWN;
318
de190aef 319 state = f->header->state;
cec736d2 320
71fa6f00
LP
321 if (state == STATE_ONLINE) {
322 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
323 return -EBUSY;
324 } else if (state == STATE_ARCHIVED)
cec736d2 325 return -ESHUTDOWN;
71fa6f00 326 else if (state != STATE_OFFLINE) {
8facc349 327 log_debug("Journal file %s has unknown state %i.", f->path, state);
71fa6f00
LP
328 return -EBUSY;
329 }
cec736d2
LP
330 }
331
d89c8fdf
ZJS
332 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
333 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 334
f1889c91 335 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 336
cec736d2
LP
337 return 0;
338}
339
2678031a
LP
340static int journal_file_fstat(JournalFile *f) {
341 assert(f);
342 assert(f->fd >= 0);
343
344 if (fstat(f->fd, &f->last_stat) < 0)
345 return -errno;
346
347 f->last_stat_usec = now(CLOCK_MONOTONIC);
348
349 /* Refuse appending to files that are already deleted */
350 if (f->last_stat.st_nlink <= 0)
351 return -EIDRM;
352
353 return 0;
354}
355
cec736d2 356static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 357 uint64_t old_size, new_size;
fec2aa2f 358 int r;
cec736d2
LP
359
360 assert(f);
361
cec736d2 362 /* We assume that this file is not sparse, and we know that
38ac38b2 363 * for sure, since we always call posix_fallocate()
cec736d2
LP
364 * ourselves */
365
fa6ac760
LP
366 if (mmap_cache_got_sigbus(f->mmap, f->fd))
367 return -EIO;
368
cec736d2 369 old_size =
23b0b2b2 370 le64toh(f->header->header_size) +
cec736d2
LP
371 le64toh(f->header->arena_size);
372
bc85bfee 373 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
374 if (new_size < le64toh(f->header->header_size))
375 new_size = le64toh(f->header->header_size);
bc85bfee 376
2678031a
LP
377 if (new_size <= old_size) {
378
379 /* We already pre-allocated enough space, but before
380 * we write to it, let's check with fstat() if the
381 * file got deleted, in order make sure we don't throw
382 * away the data immediately. Don't check fstat() for
383 * all writes though, but only once ever 10s. */
384
385 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
386 return 0;
387
388 return journal_file_fstat(f);
389 }
390
391 /* Allocate more space. */
cec736d2 392
a676e665 393 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 394 return -E2BIG;
cec736d2 395
a676e665 396 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
397 struct statvfs svfs;
398
399 if (fstatvfs(f->fd, &svfs) >= 0) {
400 uint64_t available;
401
070052ab 402 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
cec736d2
LP
403
404 if (new_size - old_size > available)
405 return -E2BIG;
406 }
407 }
408
eda4b58b
LP
409 /* Increase by larger blocks at once */
410 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
411 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
412 new_size = f->metrics.max_size;
413
bc85bfee
LP
414 /* Note that the glibc fallocate() fallback is very
415 inefficient, hence we try to minimize the allocation area
416 as we can. */
fec2aa2f
GV
417 r = posix_fallocate(f->fd, old_size, new_size - old_size);
418 if (r != 0)
419 return -r;
cec736d2 420
23b0b2b2 421 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 422
2678031a 423 return journal_file_fstat(f);
cec736d2
LP
424}
425
78519831 426static unsigned type_to_context(ObjectType type) {
d3d3208f 427 /* One context for each type, plus one catch-all for the rest */
69adae51 428 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 429 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 430 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
431}
432
7a9dabea 433static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
2678031a
LP
434 int r;
435
cec736d2 436 assert(f);
cec736d2
LP
437 assert(ret);
438
7762e02b
LP
439 if (size <= 0)
440 return -EINVAL;
441
2a59ea54 442 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
443 if (offset + size > (uint64_t) f->last_stat.st_size) {
444 /* Hmm, out of range? Let's refresh the fstat() data
445 * first, before we trust that check. */
446
2678031a
LP
447 r = journal_file_fstat(f);
448 if (r < 0)
449 return r;
450
451 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
452 return -EADDRNOTAVAIL;
453 }
454
7a9dabea 455 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
456}
457
16e9f408
LP
458static uint64_t minimum_header_size(Object *o) {
459
b8e891e6 460 static const uint64_t table[] = {
16e9f408
LP
461 [OBJECT_DATA] = sizeof(DataObject),
462 [OBJECT_FIELD] = sizeof(FieldObject),
463 [OBJECT_ENTRY] = sizeof(EntryObject),
464 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
465 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
466 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
467 [OBJECT_TAG] = sizeof(TagObject),
468 };
469
470 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
471 return sizeof(ObjectHeader);
472
473 return table[o->object.type];
474}
475
78519831 476int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
477 int r;
478 void *t;
479 Object *o;
480 uint64_t s;
481
482 assert(f);
483 assert(ret);
484
db11ac1a
LP
485 /* Objects may only be located at multiple of 64 bit */
486 if (!VALID64(offset))
487 return -EFAULT;
488
7a9dabea 489 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
490 if (r < 0)
491 return r;
492
493 o = (Object*) t;
494 s = le64toh(o->object.size);
495
496 if (s < sizeof(ObjectHeader))
497 return -EBADMSG;
498
16e9f408
LP
499 if (o->object.type <= OBJECT_UNUSED)
500 return -EBADMSG;
501
502 if (s < minimum_header_size(o))
503 return -EBADMSG;
504
d05089d8 505 if (type > OBJECT_UNUSED && o->object.type != type)
cec736d2
LP
506 return -EBADMSG;
507
508 if (s > sizeof(ObjectHeader)) {
7a9dabea 509 r = journal_file_move_to(f, type, false, offset, s, &t);
cec736d2
LP
510 if (r < 0)
511 return r;
512
513 o = (Object*) t;
514 }
515
cec736d2
LP
516 *ret = o;
517 return 0;
518}
519
d98cc1f2 520static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
521 uint64_t r;
522
523 assert(f);
524
beec0085 525 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
526
527 if (seqnum) {
de190aef 528 /* If an external seqnum counter was passed, we update
c2373f84
LP
529 * both the local and the external one, and set it to
530 * the maximum of both */
531
532 if (*seqnum + 1 > r)
533 r = *seqnum + 1;
534
535 *seqnum = r;
536 }
537
beec0085 538 f->header->tail_entry_seqnum = htole64(r);
cec736d2 539
beec0085
LP
540 if (f->header->head_entry_seqnum == 0)
541 f->header->head_entry_seqnum = htole64(r);
de190aef 542
cec736d2
LP
543 return r;
544}
545
78519831 546int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
547 int r;
548 uint64_t p;
549 Object *tail, *o;
550 void *t;
551
552 assert(f);
d05089d8 553 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
554 assert(size >= sizeof(ObjectHeader));
555 assert(offset);
556 assert(ret);
557
26687bf8
OS
558 r = journal_file_set_online(f);
559 if (r < 0)
560 return r;
561
cec736d2 562 p = le64toh(f->header->tail_object_offset);
cec736d2 563 if (p == 0)
23b0b2b2 564 p = le64toh(f->header->header_size);
cec736d2 565 else {
d05089d8 566 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
567 if (r < 0)
568 return r;
569
570 p += ALIGN64(le64toh(tail->object.size));
571 }
572
573 r = journal_file_allocate(f, p, size);
574 if (r < 0)
575 return r;
576
fcde2389 577 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
578 if (r < 0)
579 return r;
580
581 o = (Object*) t;
582
583 zero(o->object);
de190aef 584 o->object.type = type;
cec736d2
LP
585 o->object.size = htole64(size);
586
587 f->header->tail_object_offset = htole64(p);
cec736d2
LP
588 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
589
590 *ret = o;
591 *offset = p;
592
593 return 0;
594}
595
de190aef 596static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
597 uint64_t s, p;
598 Object *o;
599 int r;
600
601 assert(f);
602
070052ab
LP
603 /* We estimate that we need 1 hash table entry per 768 bytes
604 of journal file and we want to make sure we never get
605 beyond 75% fill level. Calculate the hash table size for
606 the maximum file size based on these metrics. */
4a92baf3 607
dfabe643 608 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
609 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
610 s = DEFAULT_DATA_HASH_TABLE_SIZE;
611
507f22bd 612 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 613
de190aef
LP
614 r = journal_file_append_object(f,
615 OBJECT_DATA_HASH_TABLE,
616 offsetof(Object, hash_table.items) + s,
617 &o, &p);
cec736d2
LP
618 if (r < 0)
619 return r;
620
29804cc1 621 memzero(o->hash_table.items, s);
cec736d2 622
de190aef
LP
623 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
624 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
625
626 return 0;
627}
628
de190aef 629static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
630 uint64_t s, p;
631 Object *o;
632 int r;
633
634 assert(f);
635
3c1668da
LP
636 /* We use a fixed size hash table for the fields as this
637 * number should grow very slowly only */
638
de190aef
LP
639 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
640 r = journal_file_append_object(f,
641 OBJECT_FIELD_HASH_TABLE,
642 offsetof(Object, hash_table.items) + s,
643 &o, &p);
cec736d2
LP
644 if (r < 0)
645 return r;
646
29804cc1 647 memzero(o->hash_table.items, s);
cec736d2 648
de190aef
LP
649 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
650 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
651
652 return 0;
653}
654
dade37d4 655int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
656 uint64_t s, p;
657 void *t;
658 int r;
659
660 assert(f);
661
dade37d4
LP
662 if (f->data_hash_table)
663 return 0;
664
de190aef
LP
665 p = le64toh(f->header->data_hash_table_offset);
666 s = le64toh(f->header->data_hash_table_size);
cec736d2 667
de190aef 668 r = journal_file_move_to(f,
16e9f408 669 OBJECT_DATA_HASH_TABLE,
fcde2389 670 true,
de190aef
LP
671 p, s,
672 &t);
cec736d2
LP
673 if (r < 0)
674 return r;
675
de190aef 676 f->data_hash_table = t;
cec736d2
LP
677 return 0;
678}
679
dade37d4 680int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
681 uint64_t s, p;
682 void *t;
683 int r;
684
685 assert(f);
686
dade37d4
LP
687 if (f->field_hash_table)
688 return 0;
689
de190aef
LP
690 p = le64toh(f->header->field_hash_table_offset);
691 s = le64toh(f->header->field_hash_table_size);
cec736d2 692
de190aef 693 r = journal_file_move_to(f,
16e9f408 694 OBJECT_FIELD_HASH_TABLE,
fcde2389 695 true,
de190aef
LP
696 p, s,
697 &t);
cec736d2
LP
698 if (r < 0)
699 return r;
700
de190aef 701 f->field_hash_table = t;
cec736d2
LP
702 return 0;
703}
704
3c1668da
LP
705static int journal_file_link_field(
706 JournalFile *f,
707 Object *o,
708 uint64_t offset,
709 uint64_t hash) {
710
805d1486 711 uint64_t p, h, m;
3c1668da
LP
712 int r;
713
714 assert(f);
715 assert(o);
716 assert(offset > 0);
717
718 if (o->object.type != OBJECT_FIELD)
719 return -EINVAL;
720
805d1486
LP
721 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
722 if (m <= 0)
723 return -EBADMSG;
3c1668da 724
805d1486 725 /* This might alter the window we are looking at */
3c1668da
LP
726 o->field.next_hash_offset = o->field.head_data_offset = 0;
727
805d1486 728 h = hash % m;
3c1668da
LP
729 p = le64toh(f->field_hash_table[h].tail_hash_offset);
730 if (p == 0)
731 f->field_hash_table[h].head_hash_offset = htole64(offset);
732 else {
733 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
734 if (r < 0)
735 return r;
736
737 o->field.next_hash_offset = htole64(offset);
738 }
739
740 f->field_hash_table[h].tail_hash_offset = htole64(offset);
741
742 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
743 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
744
745 return 0;
746}
747
748static int journal_file_link_data(
749 JournalFile *f,
750 Object *o,
751 uint64_t offset,
752 uint64_t hash) {
753
805d1486 754 uint64_t p, h, m;
cec736d2
LP
755 int r;
756
757 assert(f);
758 assert(o);
759 assert(offset > 0);
b588975f
LP
760
761 if (o->object.type != OBJECT_DATA)
762 return -EINVAL;
cec736d2 763
805d1486
LP
764 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
765 if (m <= 0)
766 return -EBADMSG;
48496df6 767
805d1486 768 /* This might alter the window we are looking at */
de190aef
LP
769 o->data.next_hash_offset = o->data.next_field_offset = 0;
770 o->data.entry_offset = o->data.entry_array_offset = 0;
771 o->data.n_entries = 0;
cec736d2 772
805d1486 773 h = hash % m;
8db4213e 774 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 775 if (p == 0)
cec736d2 776 /* Only entry in the hash table is easy */
de190aef 777 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 778 else {
48496df6
LP
779 /* Move back to the previous data object, to patch in
780 * pointer */
cec736d2 781
de190aef 782 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
783 if (r < 0)
784 return r;
785
de190aef 786 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
787 }
788
de190aef 789 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 790
dca6219e
LP
791 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
792 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
793
cec736d2
LP
794 return 0;
795}
796
3c1668da
LP
797int journal_file_find_field_object_with_hash(
798 JournalFile *f,
799 const void *field, uint64_t size, uint64_t hash,
800 Object **ret, uint64_t *offset) {
801
805d1486 802 uint64_t p, osize, h, m;
3c1668da
LP
803 int r;
804
805 assert(f);
806 assert(field && size > 0);
807
dade37d4
LP
808 /* If the field hash table is empty, we can't find anything */
809 if (le64toh(f->header->field_hash_table_size) <= 0)
810 return 0;
811
812 /* Map the field hash table, if it isn't mapped yet. */
813 r = journal_file_map_field_hash_table(f);
814 if (r < 0)
815 return r;
816
3c1668da
LP
817 osize = offsetof(Object, field.payload) + size;
818
805d1486 819 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
805d1486 820 if (m <= 0)
3c1668da
LP
821 return -EBADMSG;
822
805d1486 823 h = hash % m;
3c1668da
LP
824 p = le64toh(f->field_hash_table[h].head_hash_offset);
825
826 while (p > 0) {
827 Object *o;
828
829 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
830 if (r < 0)
831 return r;
832
833 if (le64toh(o->field.hash) == hash &&
834 le64toh(o->object.size) == osize &&
835 memcmp(o->field.payload, field, size) == 0) {
836
837 if (ret)
838 *ret = o;
839 if (offset)
840 *offset = p;
841
842 return 1;
843 }
844
845 p = le64toh(o->field.next_hash_offset);
846 }
847
848 return 0;
849}
850
851int journal_file_find_field_object(
852 JournalFile *f,
853 const void *field, uint64_t size,
854 Object **ret, uint64_t *offset) {
855
856 uint64_t hash;
857
858 assert(f);
859 assert(field && size > 0);
860
861 hash = hash64(field, size);
862
863 return journal_file_find_field_object_with_hash(f,
864 field, size, hash,
865 ret, offset);
866}
867
de190aef
LP
868int journal_file_find_data_object_with_hash(
869 JournalFile *f,
870 const void *data, uint64_t size, uint64_t hash,
871 Object **ret, uint64_t *offset) {
48496df6 872
805d1486 873 uint64_t p, osize, h, m;
cec736d2
LP
874 int r;
875
876 assert(f);
877 assert(data || size == 0);
878
dade37d4
LP
879 /* If there's no data hash table, then there's no entry. */
880 if (le64toh(f->header->data_hash_table_size) <= 0)
881 return 0;
882
883 /* Map the data hash table, if it isn't mapped yet. */
884 r = journal_file_map_data_hash_table(f);
885 if (r < 0)
886 return r;
887
cec736d2
LP
888 osize = offsetof(Object, data.payload) + size;
889
805d1486
LP
890 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
891 if (m <= 0)
bc85bfee
LP
892 return -EBADMSG;
893
805d1486 894 h = hash % m;
de190aef 895 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 896
de190aef
LP
897 while (p > 0) {
898 Object *o;
cec736d2 899
de190aef 900 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
901 if (r < 0)
902 return r;
903
807e17f0 904 if (le64toh(o->data.hash) != hash)
85a131e8 905 goto next;
807e17f0 906
d89c8fdf 907 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 908#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51 909 uint64_t l;
a7f7d1bd 910 size_t rsize = 0;
cec736d2 911
807e17f0
LP
912 l = le64toh(o->object.size);
913 if (l <= offsetof(Object, data.payload))
cec736d2
LP
914 return -EBADMSG;
915
807e17f0
LP
916 l -= offsetof(Object, data.payload);
917
d89c8fdf
ZJS
918 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
919 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
920 if (r < 0)
921 return r;
807e17f0 922
b785c858 923 if (rsize == size &&
807e17f0
LP
924 memcmp(f->compress_buffer, data, size) == 0) {
925
926 if (ret)
927 *ret = o;
928
929 if (offset)
930 *offset = p;
931
932 return 1;
933 }
3b1a55e1
ZJS
934#else
935 return -EPROTONOSUPPORT;
936#endif
807e17f0
LP
937 } else if (le64toh(o->object.size) == osize &&
938 memcmp(o->data.payload, data, size) == 0) {
939
cec736d2
LP
940 if (ret)
941 *ret = o;
942
943 if (offset)
944 *offset = p;
945
de190aef 946 return 1;
cec736d2
LP
947 }
948
85a131e8 949 next:
cec736d2
LP
950 p = le64toh(o->data.next_hash_offset);
951 }
952
de190aef
LP
953 return 0;
954}
955
956int journal_file_find_data_object(
957 JournalFile *f,
958 const void *data, uint64_t size,
959 Object **ret, uint64_t *offset) {
960
961 uint64_t hash;
962
963 assert(f);
964 assert(data || size == 0);
965
966 hash = hash64(data, size);
967
968 return journal_file_find_data_object_with_hash(f,
969 data, size, hash,
970 ret, offset);
971}
972
3c1668da
LP
973static int journal_file_append_field(
974 JournalFile *f,
975 const void *field, uint64_t size,
976 Object **ret, uint64_t *offset) {
977
978 uint64_t hash, p;
979 uint64_t osize;
980 Object *o;
981 int r;
982
983 assert(f);
984 assert(field && size > 0);
985
986 hash = hash64(field, size);
987
988 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
989 if (r < 0)
990 return r;
991 else if (r > 0) {
992
993 if (ret)
994 *ret = o;
995
996 if (offset)
997 *offset = p;
998
999 return 0;
1000 }
1001
1002 osize = offsetof(Object, field.payload) + size;
1003 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
1004 if (r < 0)
1005 return r;
3c1668da
LP
1006
1007 o->field.hash = htole64(hash);
1008 memcpy(o->field.payload, field, size);
1009
1010 r = journal_file_link_field(f, o, p, hash);
1011 if (r < 0)
1012 return r;
1013
1014 /* The linking might have altered the window, so let's
1015 * refresh our pointer */
1016 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1017 if (r < 0)
1018 return r;
1019
1020#ifdef HAVE_GCRYPT
1021 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1022 if (r < 0)
1023 return r;
1024#endif
1025
1026 if (ret)
1027 *ret = o;
1028
1029 if (offset)
1030 *offset = p;
1031
1032 return 0;
1033}
1034
48496df6
LP
1035static int journal_file_append_data(
1036 JournalFile *f,
1037 const void *data, uint64_t size,
1038 Object **ret, uint64_t *offset) {
1039
de190aef
LP
1040 uint64_t hash, p;
1041 uint64_t osize;
1042 Object *o;
d89c8fdf 1043 int r, compression = 0;
3c1668da 1044 const void *eq;
de190aef
LP
1045
1046 assert(f);
1047 assert(data || size == 0);
1048
1049 hash = hash64(data, size);
1050
1051 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1052 if (r < 0)
1053 return r;
1054 else if (r > 0) {
1055
1056 if (ret)
1057 *ret = o;
1058
1059 if (offset)
1060 *offset = p;
1061
1062 return 0;
1063 }
1064
1065 osize = offsetof(Object, data.payload) + size;
1066 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1067 if (r < 0)
1068 return r;
1069
cec736d2 1070 o->data.hash = htole64(hash);
807e17f0 1071
d89c8fdf
ZJS
1072#if defined(HAVE_XZ) || defined(HAVE_LZ4)
1073 if (f->compress_xz &&
807e17f0 1074 size >= COMPRESSION_SIZE_THRESHOLD) {
a7f7d1bd 1075 size_t rsize = 0;
807e17f0 1076
d89c8fdf 1077 compression = compress_blob(data, size, o->data.payload, &rsize);
807e17f0 1078
d89c8fdf 1079 if (compression) {
807e17f0 1080 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1081 o->object.flags |= compression;
807e17f0 1082
fa1c4b51 1083 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1084 size, rsize, object_compressed_to_string(compression));
807e17f0
LP
1085 }
1086 }
1087#endif
1088
d89c8fdf 1089 if (!compression && size > 0)
807e17f0 1090 memcpy(o->data.payload, data, size);
cec736d2 1091
de190aef 1092 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1093 if (r < 0)
1094 return r;
1095
48496df6
LP
1096 /* The linking might have altered the window, so let's
1097 * refresh our pointer */
1098 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1099 if (r < 0)
1100 return r;
1101
08c6f819
SL
1102 if (!data)
1103 eq = NULL;
1104 else
1105 eq = memchr(data, '=', size);
3c1668da 1106 if (eq && eq > data) {
748db592 1107 Object *fo = NULL;
3c1668da 1108 uint64_t fp;
3c1668da
LP
1109
1110 /* Create field object ... */
1111 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1112 if (r < 0)
1113 return r;
1114
1115 /* ... and link it in. */
1116 o->data.next_field_offset = fo->field.head_data_offset;
1117 fo->field.head_data_offset = le64toh(p);
1118 }
1119
5996c7c2
LP
1120#ifdef HAVE_GCRYPT
1121 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1122 if (r < 0)
1123 return r;
1124#endif
1125
cec736d2
LP
1126 if (ret)
1127 *ret = o;
1128
1129 if (offset)
de190aef 1130 *offset = p;
cec736d2
LP
1131
1132 return 0;
1133}
1134
1135uint64_t journal_file_entry_n_items(Object *o) {
1136 assert(o);
b588975f
LP
1137
1138 if (o->object.type != OBJECT_ENTRY)
1139 return 0;
cec736d2
LP
1140
1141 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1142}
1143
0284adc6 1144uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1145 assert(o);
b588975f
LP
1146
1147 if (o->object.type != OBJECT_ENTRY_ARRAY)
1148 return 0;
de190aef
LP
1149
1150 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1151}
1152
fb9a24b6
LP
1153uint64_t journal_file_hash_table_n_items(Object *o) {
1154 assert(o);
b588975f
LP
1155
1156 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1157 o->object.type != OBJECT_FIELD_HASH_TABLE)
1158 return 0;
fb9a24b6
LP
1159
1160 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1161}
1162
de190aef 1163static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1164 le64_t *first,
1165 le64_t *idx,
de190aef 1166 uint64_t p) {
cec736d2 1167 int r;
de190aef
LP
1168 uint64_t n = 0, ap = 0, q, i, a, hidx;
1169 Object *o;
1170
cec736d2 1171 assert(f);
de190aef
LP
1172 assert(first);
1173 assert(idx);
1174 assert(p > 0);
cec736d2 1175
de190aef
LP
1176 a = le64toh(*first);
1177 i = hidx = le64toh(*idx);
1178 while (a > 0) {
1179
1180 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1181 if (r < 0)
1182 return r;
cec736d2 1183
de190aef
LP
1184 n = journal_file_entry_array_n_items(o);
1185 if (i < n) {
1186 o->entry_array.items[i] = htole64(p);
1187 *idx = htole64(hidx + 1);
1188 return 0;
1189 }
cec736d2 1190
de190aef
LP
1191 i -= n;
1192 ap = a;
1193 a = le64toh(o->entry_array.next_entry_array_offset);
1194 }
1195
1196 if (hidx > n)
1197 n = (hidx+1) * 2;
1198 else
1199 n = n * 2;
1200
1201 if (n < 4)
1202 n = 4;
1203
1204 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1205 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1206 &o, &q);
cec736d2
LP
1207 if (r < 0)
1208 return r;
1209
feb12d3e 1210#ifdef HAVE_GCRYPT
5996c7c2 1211 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1212 if (r < 0)
1213 return r;
feb12d3e 1214#endif
b0af6f41 1215
de190aef 1216 o->entry_array.items[i] = htole64(p);
cec736d2 1217
de190aef 1218 if (ap == 0)
7be3aa17 1219 *first = htole64(q);
cec736d2 1220 else {
de190aef 1221 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1222 if (r < 0)
1223 return r;
1224
de190aef
LP
1225 o->entry_array.next_entry_array_offset = htole64(q);
1226 }
cec736d2 1227
2dee23eb
LP
1228 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1229 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1230
de190aef
LP
1231 *idx = htole64(hidx + 1);
1232
1233 return 0;
1234}
cec736d2 1235
de190aef 1236static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1237 le64_t *extra,
1238 le64_t *first,
1239 le64_t *idx,
de190aef
LP
1240 uint64_t p) {
1241
1242 int r;
1243
1244 assert(f);
1245 assert(extra);
1246 assert(first);
1247 assert(idx);
1248 assert(p > 0);
1249
1250 if (*idx == 0)
1251 *extra = htole64(p);
1252 else {
4fd052ae 1253 le64_t i;
de190aef 1254
7be3aa17 1255 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1256 r = link_entry_into_array(f, first, &i, p);
1257 if (r < 0)
1258 return r;
cec736d2
LP
1259 }
1260
de190aef
LP
1261 *idx = htole64(le64toh(*idx) + 1);
1262 return 0;
1263}
1264
1265static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1266 uint64_t p;
1267 int r;
1268 assert(f);
1269 assert(o);
1270 assert(offset > 0);
1271
1272 p = le64toh(o->entry.items[i].object_offset);
1273 if (p == 0)
1274 return -EINVAL;
1275
1276 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1277 if (r < 0)
1278 return r;
1279
de190aef
LP
1280 return link_entry_into_array_plus_one(f,
1281 &o->data.entry_offset,
1282 &o->data.entry_array_offset,
1283 &o->data.n_entries,
1284 offset);
cec736d2
LP
1285}
1286
1287static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1288 uint64_t n, i;
cec736d2
LP
1289 int r;
1290
1291 assert(f);
1292 assert(o);
1293 assert(offset > 0);
b588975f
LP
1294
1295 if (o->object.type != OBJECT_ENTRY)
1296 return -EINVAL;
cec736d2 1297
b788cc23
LP
1298 __sync_synchronize();
1299
cec736d2 1300 /* Link up the entry itself */
de190aef
LP
1301 r = link_entry_into_array(f,
1302 &f->header->entry_array_offset,
1303 &f->header->n_entries,
1304 offset);
1305 if (r < 0)
1306 return r;
cec736d2 1307
507f22bd 1308 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1309
de190aef 1310 if (f->header->head_entry_realtime == 0)
0ac38b70 1311 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1312
0ac38b70 1313 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1314 f->header->tail_entry_monotonic = o->entry.monotonic;
1315
1316 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1317
1318 /* Link up the items */
1319 n = journal_file_entry_n_items(o);
1320 for (i = 0; i < n; i++) {
1321 r = journal_file_link_entry_item(f, o, offset, i);
1322 if (r < 0)
1323 return r;
1324 }
1325
cec736d2
LP
1326 return 0;
1327}
1328
1329static int journal_file_append_entry_internal(
1330 JournalFile *f,
1331 const dual_timestamp *ts,
1332 uint64_t xor_hash,
1333 const EntryItem items[], unsigned n_items,
de190aef 1334 uint64_t *seqnum,
cec736d2
LP
1335 Object **ret, uint64_t *offset) {
1336 uint64_t np;
1337 uint64_t osize;
1338 Object *o;
1339 int r;
1340
1341 assert(f);
1342 assert(items || n_items == 0);
de190aef 1343 assert(ts);
cec736d2
LP
1344
1345 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1346
de190aef 1347 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1348 if (r < 0)
1349 return r;
1350
d98cc1f2 1351 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1352 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1353 o->entry.realtime = htole64(ts->realtime);
1354 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1355 o->entry.xor_hash = htole64(xor_hash);
1356 o->entry.boot_id = f->header->boot_id;
1357
feb12d3e 1358#ifdef HAVE_GCRYPT
5996c7c2 1359 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1360 if (r < 0)
1361 return r;
feb12d3e 1362#endif
b0af6f41 1363
cec736d2
LP
1364 r = journal_file_link_entry(f, o, np);
1365 if (r < 0)
1366 return r;
1367
1368 if (ret)
1369 *ret = o;
1370
1371 if (offset)
1372 *offset = np;
1373
1374 return 0;
1375}
1376
cf244689 1377void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1378 assert(f);
1379
1380 /* inotify() does not receive IN_MODIFY events from file
1381 * accesses done via mmap(). After each access we hence
1382 * trigger IN_MODIFY by truncating the journal file to its
1383 * current size which triggers IN_MODIFY. */
1384
bc85bfee
LP
1385 __sync_synchronize();
1386
50f20cfd 1387 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
56f64d95 1388 log_error_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1389}
1390
1f2da9ec
LP
1391static int entry_item_cmp(const void *_a, const void *_b) {
1392 const EntryItem *a = _a, *b = _b;
1393
1394 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1395 return -1;
1396 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1397 return 1;
1398 return 0;
1399}
1400
de190aef 1401int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1402 unsigned i;
1403 EntryItem *items;
1404 int r;
1405 uint64_t xor_hash = 0;
de190aef 1406 struct dual_timestamp _ts;
cec736d2
LP
1407
1408 assert(f);
1409 assert(iovec || n_iovec == 0);
1410
de190aef
LP
1411 if (!ts) {
1412 dual_timestamp_get(&_ts);
1413 ts = &_ts;
1414 }
1415
1416 if (f->tail_entry_monotonic_valid &&
1417 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1418 return -EINVAL;
1419
feb12d3e 1420#ifdef HAVE_GCRYPT
7560fffc
LP
1421 r = journal_file_maybe_append_tag(f, ts->realtime);
1422 if (r < 0)
1423 return r;
feb12d3e 1424#endif
7560fffc 1425
64825d3c 1426 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1427 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1428
1429 for (i = 0; i < n_iovec; i++) {
1430 uint64_t p;
1431 Object *o;
1432
1433 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1434 if (r < 0)
cf244689 1435 return r;
cec736d2
LP
1436
1437 xor_hash ^= le64toh(o->data.hash);
1438 items[i].object_offset = htole64(p);
de7b95cd 1439 items[i].hash = o->data.hash;
cec736d2
LP
1440 }
1441
1f2da9ec
LP
1442 /* Order by the position on disk, in order to improve seek
1443 * times for rotating media. */
7ff7394d 1444 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1445
de190aef 1446 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1447
fa6ac760
LP
1448 /* If the memory mapping triggered a SIGBUS then we return an
1449 * IO error and ignore the error code passed down to us, since
1450 * it is very likely just an effect of a nullified replacement
1451 * mapping page */
1452
1453 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1454 r = -EIO;
1455
50f20cfd
LP
1456 journal_file_post_change(f);
1457
cec736d2
LP
1458 return r;
1459}
1460
/* One cached position inside a chain of entry arrays, so that lookups in
 * the same chain can resume where the previous one ended instead of
 * walking the chain from its start again. */
typedef struct ChainCacheItem {
        uint64_t first;      /* offset of the array the chain starts with; also the hashmap key */
        uint64_t array;      /* offset of the array that was cached */
        uint64_t begin;      /* first item stored in the cached array */
        uint64_t total;      /* number of items in all arrays of the chain preceding the cached one */
        uint64_t last_index; /* index looked at last, used to improve locality when bisecting */
} ChainCacheItem;
1468
1469static void chain_cache_put(
4743015d 1470 OrderedHashmap *h,
a4bcff5b
LP
1471 ChainCacheItem *ci,
1472 uint64_t first,
1473 uint64_t array,
1474 uint64_t begin,
f268980d
LP
1475 uint64_t total,
1476 uint64_t last_index) {
a4bcff5b
LP
1477
1478 if (!ci) {
34741aa3
LP
1479 /* If the chain item to cache for this chain is the
1480 * first one it's not worth caching anything */
1481 if (array == first)
1482 return;
1483
29433089 1484 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 1485 ci = ordered_hashmap_steal_first(h);
29433089
LP
1486 assert(ci);
1487 } else {
a4bcff5b
LP
1488 ci = new(ChainCacheItem, 1);
1489 if (!ci)
1490 return;
1491 }
1492
1493 ci->first = first;
1494
4743015d 1495 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
1496 free(ci);
1497 return;
1498 }
1499 } else
1500 assert(ci->first == first);
1501
1502 ci->array = array;
1503 ci->begin = begin;
1504 ci->total = total;
f268980d 1505 ci->last_index = last_index;
a4bcff5b
LP
1506}
1507
f268980d
LP
1508static int generic_array_get(
1509 JournalFile *f,
1510 uint64_t first,
1511 uint64_t i,
1512 Object **ret, uint64_t *offset) {
de190aef 1513
cec736d2 1514 Object *o;
a4bcff5b 1515 uint64_t p = 0, a, t = 0;
cec736d2 1516 int r;
a4bcff5b 1517 ChainCacheItem *ci;
cec736d2
LP
1518
1519 assert(f);
1520
de190aef 1521 a = first;
a4bcff5b
LP
1522
1523 /* Try the chain cache first */
4743015d 1524 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1525 if (ci && i > ci->total) {
1526 a = ci->array;
1527 i -= ci->total;
1528 t = ci->total;
1529 }
1530
de190aef 1531 while (a > 0) {
a4bcff5b 1532 uint64_t k;
cec736d2 1533
de190aef
LP
1534 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1535 if (r < 0)
1536 return r;
cec736d2 1537
a4bcff5b
LP
1538 k = journal_file_entry_array_n_items(o);
1539 if (i < k) {
de190aef 1540 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1541 goto found;
cec736d2
LP
1542 }
1543
a4bcff5b
LP
1544 i -= k;
1545 t += k;
de190aef
LP
1546 a = le64toh(o->entry_array.next_entry_array_offset);
1547 }
1548
a4bcff5b
LP
1549 return 0;
1550
1551found:
1552 /* Let's cache this item for the next invocation */
af13a6b0 1553 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1554
1555 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1556 if (r < 0)
1557 return r;
1558
1559 if (ret)
1560 *ret = o;
1561
1562 if (offset)
1563 *offset = p;
1564
1565 return 1;
1566}
1567
f268980d
LP
1568static int generic_array_get_plus_one(
1569 JournalFile *f,
1570 uint64_t extra,
1571 uint64_t first,
1572 uint64_t i,
1573 Object **ret, uint64_t *offset) {
de190aef
LP
1574
1575 Object *o;
1576
1577 assert(f);
1578
1579 if (i == 0) {
1580 int r;
1581
1582 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1583 if (r < 0)
1584 return r;
1585
de190aef
LP
1586 if (ret)
1587 *ret = o;
cec736d2 1588
de190aef
LP
1589 if (offset)
1590 *offset = extra;
cec736d2 1591
de190aef 1592 return 1;
cec736d2
LP
1593 }
1594
de190aef
LP
1595 return generic_array_get(f, first, i-1, ret, offset);
1596}
cec736d2 1597
de190aef
LP
/* Results of the test_object callbacks used when bisecting, as returned by
 * e.g. test_object_offset(): the tested object matches the needle
 * (TEST_FOUND), sorts before it (TEST_LEFT) or sorts after it
 * (TEST_RIGHT). */
enum {
        TEST_FOUND = 0,
        TEST_LEFT  = 1,
        TEST_RIGHT = 2
};
cec736d2 1603
f268980d
LP
1604static int generic_array_bisect(
1605 JournalFile *f,
1606 uint64_t first,
1607 uint64_t n,
1608 uint64_t needle,
1609 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1610 direction_t direction,
1611 Object **ret,
1612 uint64_t *offset,
1613 uint64_t *idx) {
1614
1615 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1616 bool subtract_one = false;
1617 Object *o, *array = NULL;
1618 int r;
a4bcff5b 1619 ChainCacheItem *ci;
cec736d2 1620
de190aef
LP
1621 assert(f);
1622 assert(test_object);
cec736d2 1623
a4bcff5b 1624 /* Start with the first array in the chain */
de190aef 1625 a = first;
a4bcff5b 1626
4743015d 1627 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1628 if (ci && n > ci->total) {
1629 /* Ah, we have iterated this bisection array chain
1630 * previously! Let's see if we can skip ahead in the
1631 * chain, as far as the last time. But we can't jump
1632 * backwards in the chain, so let's check that
1633 * first. */
1634
1635 r = test_object(f, ci->begin, needle);
1636 if (r < 0)
1637 return r;
1638
1639 if (r == TEST_LEFT) {
f268980d 1640 /* OK, what we are looking for is right of the
a4bcff5b
LP
1641 * begin of this EntryArray, so let's jump
1642 * straight to previously cached array in the
1643 * chain */
1644
1645 a = ci->array;
1646 n -= ci->total;
1647 t = ci->total;
f268980d 1648 last_index = ci->last_index;
a4bcff5b
LP
1649 }
1650 }
1651
de190aef
LP
1652 while (a > 0) {
1653 uint64_t left, right, k, lp;
1654
1655 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1656 if (r < 0)
1657 return r;
1658
de190aef
LP
1659 k = journal_file_entry_array_n_items(array);
1660 right = MIN(k, n);
1661 if (right <= 0)
1662 return 0;
cec736d2 1663
de190aef
LP
1664 i = right - 1;
1665 lp = p = le64toh(array->entry_array.items[i]);
1666 if (p <= 0)
1667 return -EBADMSG;
cec736d2 1668
de190aef
LP
1669 r = test_object(f, p, needle);
1670 if (r < 0)
1671 return r;
cec736d2 1672
de190aef
LP
1673 if (r == TEST_FOUND)
1674 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1675
1676 if (r == TEST_RIGHT) {
1677 left = 0;
1678 right -= 1;
f268980d
LP
1679
1680 if (last_index != (uint64_t) -1) {
1681 assert(last_index <= right);
1682
1683 /* If we cached the last index we
1684 * looked at, let's try to not to jump
1685 * too wildly around and see if we can
1686 * limit the range to look at early to
1687 * the immediate neighbors of the last
1688 * index we looked at. */
1689
1690 if (last_index > 0) {
1691 uint64_t x = last_index - 1;
1692
1693 p = le64toh(array->entry_array.items[x]);
1694 if (p <= 0)
1695 return -EBADMSG;
1696
1697 r = test_object(f, p, needle);
1698 if (r < 0)
1699 return r;
1700
1701 if (r == TEST_FOUND)
1702 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1703
1704 if (r == TEST_RIGHT)
1705 right = x;
1706 else
1707 left = x + 1;
1708 }
1709
1710 if (last_index < right) {
1711 uint64_t y = last_index + 1;
1712
1713 p = le64toh(array->entry_array.items[y]);
1714 if (p <= 0)
1715 return -EBADMSG;
1716
1717 r = test_object(f, p, needle);
1718 if (r < 0)
1719 return r;
1720
1721 if (r == TEST_FOUND)
1722 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1723
1724 if (r == TEST_RIGHT)
1725 right = y;
1726 else
1727 left = y + 1;
1728 }
f268980d
LP
1729 }
1730
de190aef
LP
1731 for (;;) {
1732 if (left == right) {
1733 if (direction == DIRECTION_UP)
1734 subtract_one = true;
1735
1736 i = left;
1737 goto found;
1738 }
1739
1740 assert(left < right);
de190aef 1741 i = (left + right) / 2;
f268980d 1742
de190aef
LP
1743 p = le64toh(array->entry_array.items[i]);
1744 if (p <= 0)
1745 return -EBADMSG;
1746
1747 r = test_object(f, p, needle);
1748 if (r < 0)
1749 return r;
cec736d2 1750
de190aef
LP
1751 if (r == TEST_FOUND)
1752 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1753
1754 if (r == TEST_RIGHT)
1755 right = i;
1756 else
1757 left = i + 1;
1758 }
1759 }
1760
2173cbf8 1761 if (k >= n) {
cbdca852
LP
1762 if (direction == DIRECTION_UP) {
1763 i = n;
1764 subtract_one = true;
1765 goto found;
1766 }
1767
cec736d2 1768 return 0;
cbdca852 1769 }
cec736d2 1770
de190aef
LP
1771 last_p = lp;
1772
1773 n -= k;
1774 t += k;
f268980d 1775 last_index = (uint64_t) -1;
de190aef 1776 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1777 }
1778
1779 return 0;
de190aef
LP
1780
1781found:
1782 if (subtract_one && t == 0 && i == 0)
1783 return 0;
1784
a4bcff5b 1785 /* Let's cache this item for the next invocation */
af13a6b0 1786 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 1787
de190aef
LP
1788 if (subtract_one && i == 0)
1789 p = last_p;
1790 else if (subtract_one)
1791 p = le64toh(array->entry_array.items[i-1]);
1792 else
1793 p = le64toh(array->entry_array.items[i]);
1794
1795 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1796 if (r < 0)
1797 return r;
1798
1799 if (ret)
1800 *ret = o;
1801
1802 if (offset)
1803 *offset = p;
1804
1805 if (idx)
cbdca852 1806 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1807
1808 return 1;
cec736d2
LP
1809}
1810
f268980d
LP
1811static int generic_array_bisect_plus_one(
1812 JournalFile *f,
1813 uint64_t extra,
1814 uint64_t first,
1815 uint64_t n,
1816 uint64_t needle,
1817 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1818 direction_t direction,
1819 Object **ret,
1820 uint64_t *offset,
1821 uint64_t *idx) {
de190aef 1822
cec736d2 1823 int r;
cbdca852
LP
1824 bool step_back = false;
1825 Object *o;
cec736d2
LP
1826
1827 assert(f);
de190aef 1828 assert(test_object);
cec736d2 1829
de190aef
LP
1830 if (n <= 0)
1831 return 0;
cec736d2 1832
de190aef
LP
1833 /* This bisects the array in object 'first', but first checks
1834 * an extra */
de190aef
LP
1835 r = test_object(f, extra, needle);
1836 if (r < 0)
1837 return r;
a536e261
LP
1838
1839 if (r == TEST_FOUND)
1840 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1841
cbdca852
LP
1842 /* if we are looking with DIRECTION_UP then we need to first
1843 see if in the actual array there is a matching entry, and
1844 return the last one of that. But if there isn't any we need
1845 to return this one. Hence remember this, and return it
1846 below. */
1847 if (r == TEST_LEFT)
1848 step_back = direction == DIRECTION_UP;
de190aef 1849
cbdca852
LP
1850 if (r == TEST_RIGHT) {
1851 if (direction == DIRECTION_DOWN)
1852 goto found;
1853 else
1854 return 0;
a536e261 1855 }
cec736d2 1856
de190aef
LP
1857 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1858
cbdca852
LP
1859 if (r == 0 && step_back)
1860 goto found;
1861
ecf68b1d 1862 if (r > 0 && idx)
de190aef
LP
1863 (*idx) ++;
1864
1865 return r;
cbdca852
LP
1866
1867found:
1868 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1869 if (r < 0)
1870 return r;
1871
1872 if (ret)
1873 *ret = o;
1874
1875 if (offset)
1876 *offset = extra;
1877
1878 if (idx)
1879 *idx = 0;
1880
1881 return 1;
1882}
1883
44a6b1b6 1884_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1885 assert(f);
1886 assert(p > 0);
1887
1888 if (p == needle)
1889 return TEST_FOUND;
1890 else if (p < needle)
1891 return TEST_LEFT;
1892 else
1893 return TEST_RIGHT;
1894}
1895
de190aef
LP
1896static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1897 Object *o;
1898 int r;
1899
1900 assert(f);
1901 assert(p > 0);
1902
1903 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1904 if (r < 0)
1905 return r;
1906
de190aef
LP
1907 if (le64toh(o->entry.seqnum) == needle)
1908 return TEST_FOUND;
1909 else if (le64toh(o->entry.seqnum) < needle)
1910 return TEST_LEFT;
1911 else
1912 return TEST_RIGHT;
1913}
cec736d2 1914
de190aef
LP
1915int journal_file_move_to_entry_by_seqnum(
1916 JournalFile *f,
1917 uint64_t seqnum,
1918 direction_t direction,
1919 Object **ret,
1920 uint64_t *offset) {
1921
1922 return generic_array_bisect(f,
1923 le64toh(f->header->entry_array_offset),
1924 le64toh(f->header->n_entries),
1925 seqnum,
1926 test_object_seqnum,
1927 direction,
1928 ret, offset, NULL);
1929}
cec736d2 1930
de190aef
LP
1931static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1932 Object *o;
1933 int r;
1934
1935 assert(f);
1936 assert(p > 0);
1937
1938 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1939 if (r < 0)
1940 return r;
1941
1942 if (le64toh(o->entry.realtime) == needle)
1943 return TEST_FOUND;
1944 else if (le64toh(o->entry.realtime) < needle)
1945 return TEST_LEFT;
1946 else
1947 return TEST_RIGHT;
cec736d2
LP
1948}
1949
de190aef
LP
1950int journal_file_move_to_entry_by_realtime(
1951 JournalFile *f,
1952 uint64_t realtime,
1953 direction_t direction,
1954 Object **ret,
1955 uint64_t *offset) {
1956
1957 return generic_array_bisect(f,
1958 le64toh(f->header->entry_array_offset),
1959 le64toh(f->header->n_entries),
1960 realtime,
1961 test_object_realtime,
1962 direction,
1963 ret, offset, NULL);
1964}
1965
1966static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1967 Object *o;
1968 int r;
1969
1970 assert(f);
1971 assert(p > 0);
1972
1973 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1974 if (r < 0)
1975 return r;
1976
1977 if (le64toh(o->entry.monotonic) == needle)
1978 return TEST_FOUND;
1979 else if (le64toh(o->entry.monotonic) < needle)
1980 return TEST_LEFT;
1981 else
1982 return TEST_RIGHT;
1983}
1984
2a560338 1985static int find_data_object_by_boot_id(
47838ab3
ZJS
1986 JournalFile *f,
1987 sd_id128_t boot_id,
1988 Object **o,
1989 uint64_t *b) {
2a560338 1990
47838ab3
ZJS
1991 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
1992
1993 sd_id128_to_string(boot_id, t + 9);
1994 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
1995}
1996
de190aef
LP
1997int journal_file_move_to_entry_by_monotonic(
1998 JournalFile *f,
1999 sd_id128_t boot_id,
2000 uint64_t monotonic,
2001 direction_t direction,
2002 Object **ret,
2003 uint64_t *offset) {
2004
de190aef
LP
2005 Object *o;
2006 int r;
2007
cbdca852 2008 assert(f);
de190aef 2009
47838ab3 2010 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
2011 if (r < 0)
2012 return r;
cbdca852 2013 if (r == 0)
de190aef
LP
2014 return -ENOENT;
2015
2016 return generic_array_bisect_plus_one(f,
2017 le64toh(o->data.entry_offset),
2018 le64toh(o->data.entry_array_offset),
2019 le64toh(o->data.n_entries),
2020 monotonic,
2021 test_object_monotonic,
2022 direction,
2023 ret, offset, NULL);
2024}
2025
1fc605b0 2026void journal_file_reset_location(JournalFile *f) {
6573ef05 2027 f->location_type = LOCATION_HEAD;
1fc605b0 2028 f->current_offset = 0;
6573ef05
MS
2029 f->current_seqnum = 0;
2030 f->current_realtime = 0;
2031 f->current_monotonic = 0;
2032 zero(f->current_boot_id);
2033 f->current_xor_hash = 0;
2034}
2035
950c07d4 2036void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2037 f->location_type = LOCATION_SEEK;
2038 f->current_offset = offset;
2039 f->current_seqnum = le64toh(o->entry.seqnum);
2040 f->current_realtime = le64toh(o->entry.realtime);
2041 f->current_monotonic = le64toh(o->entry.monotonic);
2042 f->current_boot_id = o->entry.boot_id;
2043 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2044}
2045
d8ae66d7
MS
2046int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2047 assert(af);
2048 assert(bf);
2049 assert(af->location_type == LOCATION_SEEK);
2050 assert(bf->location_type == LOCATION_SEEK);
2051
2052 /* If contents and timestamps match, these entries are
2053 * identical, even if the seqnum does not match */
2054 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2055 af->current_monotonic == bf->current_monotonic &&
2056 af->current_realtime == bf->current_realtime &&
2057 af->current_xor_hash == bf->current_xor_hash)
2058 return 0;
2059
2060 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2061
2062 /* If this is from the same seqnum source, compare
2063 * seqnums */
2064 if (af->current_seqnum < bf->current_seqnum)
2065 return -1;
2066 if (af->current_seqnum > bf->current_seqnum)
2067 return 1;
2068
2069 /* Wow! This is weird, different data but the same
2070 * seqnums? Something is borked, but let's make the
2071 * best of it and compare by time. */
2072 }
2073
2074 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2075
2076 /* If the boot id matches, compare monotonic time */
2077 if (af->current_monotonic < bf->current_monotonic)
2078 return -1;
2079 if (af->current_monotonic > bf->current_monotonic)
2080 return 1;
2081 }
2082
2083 /* Otherwise, compare UTC time */
2084 if (af->current_realtime < bf->current_realtime)
2085 return -1;
2086 if (af->current_realtime > bf->current_realtime)
2087 return 1;
2088
2089 /* Finally, compare by contents */
2090 if (af->current_xor_hash < bf->current_xor_hash)
2091 return -1;
2092 if (af->current_xor_hash > bf->current_xor_hash)
2093 return 1;
2094
2095 return 0;
2096}
2097
de190aef
LP
2098int journal_file_next_entry(
2099 JournalFile *f,
f534928a 2100 uint64_t p,
de190aef
LP
2101 direction_t direction,
2102 Object **ret, uint64_t *offset) {
2103
fb099c8d 2104 uint64_t i, n, ofs;
cec736d2
LP
2105 int r;
2106
2107 assert(f);
de190aef
LP
2108
2109 n = le64toh(f->header->n_entries);
2110 if (n <= 0)
2111 return 0;
cec736d2 2112
f534928a 2113 if (p == 0)
de190aef 2114 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2115 else {
de190aef
LP
2116 r = generic_array_bisect(f,
2117 le64toh(f->header->entry_array_offset),
2118 le64toh(f->header->n_entries),
2119 p,
2120 test_object_offset,
2121 DIRECTION_DOWN,
2122 NULL, NULL,
2123 &i);
2124 if (r <= 0)
2125 return r;
2126
2127 if (direction == DIRECTION_DOWN) {
2128 if (i >= n - 1)
2129 return 0;
2130
2131 i++;
2132 } else {
2133 if (i <= 0)
2134 return 0;
2135
2136 i--;
2137 }
cec736d2
LP
2138 }
2139
de190aef 2140 /* And jump to it */
fb099c8d
ZJS
2141 r = generic_array_get(f,
2142 le64toh(f->header->entry_array_offset),
2143 i,
2144 ret, &ofs);
2145 if (r <= 0)
2146 return r;
2147
2148 if (p > 0 &&
2149 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2150 log_debug("%s: entry array corrupted at entry %"PRIu64,
2151 f->path, i);
2152 return -EBADMSG;
2153 }
2154
2155 if (offset)
2156 *offset = ofs;
2157
2158 return 1;
de190aef 2159}
cec736d2 2160
de190aef
LP
2161int journal_file_next_entry_for_data(
2162 JournalFile *f,
2163 Object *o, uint64_t p,
2164 uint64_t data_offset,
2165 direction_t direction,
2166 Object **ret, uint64_t *offset) {
2167
2168 uint64_t n, i;
cec736d2 2169 int r;
de190aef 2170 Object *d;
cec736d2
LP
2171
2172 assert(f);
de190aef 2173 assert(p > 0 || !o);
cec736d2 2174
de190aef 2175 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2176 if (r < 0)
de190aef 2177 return r;
cec736d2 2178
de190aef
LP
2179 n = le64toh(d->data.n_entries);
2180 if (n <= 0)
2181 return n;
cec736d2 2182
de190aef
LP
2183 if (!o)
2184 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2185 else {
2186 if (o->object.type != OBJECT_ENTRY)
2187 return -EINVAL;
cec736d2 2188
de190aef
LP
2189 r = generic_array_bisect_plus_one(f,
2190 le64toh(d->data.entry_offset),
2191 le64toh(d->data.entry_array_offset),
2192 le64toh(d->data.n_entries),
2193 p,
2194 test_object_offset,
2195 DIRECTION_DOWN,
2196 NULL, NULL,
2197 &i);
2198
2199 if (r <= 0)
cec736d2
LP
2200 return r;
2201
de190aef
LP
2202 if (direction == DIRECTION_DOWN) {
2203 if (i >= n - 1)
2204 return 0;
cec736d2 2205
de190aef
LP
2206 i++;
2207 } else {
2208 if (i <= 0)
2209 return 0;
cec736d2 2210
de190aef
LP
2211 i--;
2212 }
cec736d2 2213
de190aef 2214 }
cec736d2 2215
de190aef
LP
2216 return generic_array_get_plus_one(f,
2217 le64toh(d->data.entry_offset),
2218 le64toh(d->data.entry_array_offset),
2219 i,
2220 ret, offset);
2221}
cec736d2 2222
cbdca852
LP
2223int journal_file_move_to_entry_by_offset_for_data(
2224 JournalFile *f,
2225 uint64_t data_offset,
2226 uint64_t p,
2227 direction_t direction,
2228 Object **ret, uint64_t *offset) {
2229
2230 int r;
2231 Object *d;
2232
2233 assert(f);
2234
2235 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2236 if (r < 0)
2237 return r;
2238
2239 return generic_array_bisect_plus_one(f,
2240 le64toh(d->data.entry_offset),
2241 le64toh(d->data.entry_array_offset),
2242 le64toh(d->data.n_entries),
2243 p,
2244 test_object_offset,
2245 direction,
2246 ret, offset, NULL);
2247}
2248
2249int journal_file_move_to_entry_by_monotonic_for_data(
2250 JournalFile *f,
2251 uint64_t data_offset,
2252 sd_id128_t boot_id,
2253 uint64_t monotonic,
2254 direction_t direction,
2255 Object **ret, uint64_t *offset) {
2256
cbdca852
LP
2257 Object *o, *d;
2258 int r;
2259 uint64_t b, z;
2260
2261 assert(f);
2262
2263 /* First, seek by time */
47838ab3 2264 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2265 if (r < 0)
2266 return r;
2267 if (r == 0)
2268 return -ENOENT;
2269
2270 r = generic_array_bisect_plus_one(f,
2271 le64toh(o->data.entry_offset),
2272 le64toh(o->data.entry_array_offset),
2273 le64toh(o->data.n_entries),
2274 monotonic,
2275 test_object_monotonic,
2276 direction,
2277 NULL, &z, NULL);
2278 if (r <= 0)
2279 return r;
2280
2281 /* And now, continue seeking until we find an entry that
2282 * exists in both bisection arrays */
2283
2284 for (;;) {
2285 Object *qo;
2286 uint64_t p, q;
2287
2288 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2289 if (r < 0)
2290 return r;
2291
2292 r = generic_array_bisect_plus_one(f,
2293 le64toh(d->data.entry_offset),
2294 le64toh(d->data.entry_array_offset),
2295 le64toh(d->data.n_entries),
2296 z,
2297 test_object_offset,
2298 direction,
2299 NULL, &p, NULL);
2300 if (r <= 0)
2301 return r;
2302
2303 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2304 if (r < 0)
2305 return r;
2306
2307 r = generic_array_bisect_plus_one(f,
2308 le64toh(o->data.entry_offset),
2309 le64toh(o->data.entry_array_offset),
2310 le64toh(o->data.n_entries),
2311 p,
2312 test_object_offset,
2313 direction,
2314 &qo, &q, NULL);
2315
2316 if (r <= 0)
2317 return r;
2318
2319 if (p == q) {
2320 if (ret)
2321 *ret = qo;
2322 if (offset)
2323 *offset = q;
2324
2325 return 1;
2326 }
2327
2328 z = q;
2329 }
cbdca852
LP
2330}
2331
de190aef
LP
2332int journal_file_move_to_entry_by_seqnum_for_data(
2333 JournalFile *f,
2334 uint64_t data_offset,
2335 uint64_t seqnum,
2336 direction_t direction,
2337 Object **ret, uint64_t *offset) {
cec736d2 2338
de190aef
LP
2339 Object *d;
2340 int r;
cec736d2 2341
91a31dde
LP
2342 assert(f);
2343
de190aef 2344 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2345 if (r < 0)
de190aef 2346 return r;
cec736d2 2347
de190aef
LP
2348 return generic_array_bisect_plus_one(f,
2349 le64toh(d->data.entry_offset),
2350 le64toh(d->data.entry_array_offset),
2351 le64toh(d->data.n_entries),
2352 seqnum,
2353 test_object_seqnum,
2354 direction,
2355 ret, offset, NULL);
2356}
cec736d2 2357
de190aef
LP
2358int journal_file_move_to_entry_by_realtime_for_data(
2359 JournalFile *f,
2360 uint64_t data_offset,
2361 uint64_t realtime,
2362 direction_t direction,
2363 Object **ret, uint64_t *offset) {
2364
2365 Object *d;
2366 int r;
2367
91a31dde
LP
2368 assert(f);
2369
de190aef 2370 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2371 if (r < 0)
de190aef
LP
2372 return r;
2373
2374 return generic_array_bisect_plus_one(f,
2375 le64toh(d->data.entry_offset),
2376 le64toh(d->data.entry_array_offset),
2377 le64toh(d->data.n_entries),
2378 realtime,
2379 test_object_realtime,
2380 direction,
2381 ret, offset, NULL);
cec736d2
LP
2382}
2383
0284adc6 2384void journal_file_dump(JournalFile *f) {
7560fffc 2385 Object *o;
7560fffc 2386 int r;
0284adc6 2387 uint64_t p;
7560fffc
LP
2388
2389 assert(f);
2390
0284adc6 2391 journal_file_print_header(f);
7560fffc 2392
0284adc6
LP
2393 p = le64toh(f->header->header_size);
2394 while (p != 0) {
d05089d8 2395 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
2396 if (r < 0)
2397 goto fail;
7560fffc 2398
0284adc6 2399 switch (o->object.type) {
d98cc1f2 2400
0284adc6
LP
2401 case OBJECT_UNUSED:
2402 printf("Type: OBJECT_UNUSED\n");
2403 break;
d98cc1f2 2404
0284adc6
LP
2405 case OBJECT_DATA:
2406 printf("Type: OBJECT_DATA\n");
2407 break;
7560fffc 2408
3c1668da
LP
2409 case OBJECT_FIELD:
2410 printf("Type: OBJECT_FIELD\n");
2411 break;
2412
0284adc6 2413 case OBJECT_ENTRY:
507f22bd
ZJS
2414 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2415 le64toh(o->entry.seqnum),
2416 le64toh(o->entry.monotonic),
2417 le64toh(o->entry.realtime));
0284adc6 2418 break;
7560fffc 2419
0284adc6
LP
2420 case OBJECT_FIELD_HASH_TABLE:
2421 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2422 break;
7560fffc 2423
0284adc6
LP
2424 case OBJECT_DATA_HASH_TABLE:
2425 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2426 break;
7560fffc 2427
0284adc6
LP
2428 case OBJECT_ENTRY_ARRAY:
2429 printf("Type: OBJECT_ENTRY_ARRAY\n");
2430 break;
7560fffc 2431
0284adc6 2432 case OBJECT_TAG:
507f22bd
ZJS
2433 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2434 le64toh(o->tag.seqnum),
2435 le64toh(o->tag.epoch));
0284adc6 2436 break;
3c1668da
LP
2437
2438 default:
8facc349 2439 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 2440 break;
0284adc6 2441 }
7560fffc 2442
d89c8fdf
ZJS
2443 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2444 printf("Flags: %s\n",
2445 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 2446
0284adc6
LP
2447 if (p == le64toh(f->header->tail_object_offset))
2448 p = 0;
2449 else
2450 p = p + ALIGN64(le64toh(o->object.size));
2451 }
7560fffc 2452
0284adc6
LP
2453 return;
2454fail:
2455 log_error("File corrupt");
7560fffc
LP
2456}
2457
718fe4b1
ZJS
2458static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2459 const char *x;
2460
2461 x = format_timestamp(buf, l, t);
2462 if (x)
2463 return x;
2464 return " --- ";
2465}
2466
0284adc6 2467void journal_file_print_header(JournalFile *f) {
2765b7bb 2468 char a[33], b[33], c[33], d[33];
ed375beb 2469 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2470 struct stat st;
2471 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2472
2473 assert(f);
7560fffc 2474
0284adc6
LP
2475 printf("File Path: %s\n"
2476 "File ID: %s\n"
2477 "Machine ID: %s\n"
2478 "Boot ID: %s\n"
2479 "Sequential Number ID: %s\n"
2480 "State: %s\n"
2481 "Compatible Flags:%s%s\n"
d89c8fdf 2482 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
2483 "Header size: %"PRIu64"\n"
2484 "Arena size: %"PRIu64"\n"
2485 "Data Hash Table Size: %"PRIu64"\n"
2486 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2487 "Rotate Suggested: %s\n"
507f22bd
ZJS
2488 "Head Sequential Number: %"PRIu64"\n"
2489 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2490 "Head Realtime Timestamp: %s\n"
3223f44f 2491 "Tail Realtime Timestamp: %s\n"
ed375beb 2492 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2493 "Objects: %"PRIu64"\n"
2494 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2495 f->path,
2496 sd_id128_to_string(f->header->file_id, a),
2497 sd_id128_to_string(f->header->machine_id, b),
2498 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2499 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2500 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2501 f->header->state == STATE_ONLINE ? "ONLINE" :
2502 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 2503 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
2504 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2505 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2506 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2507 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
2508 le64toh(f->header->header_size),
2509 le64toh(f->header->arena_size),
2510 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2511 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2512 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2513 le64toh(f->header->head_entry_seqnum),
2514 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2515 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2516 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2517 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2518 le64toh(f->header->n_objects),
2519 le64toh(f->header->n_entries));
7560fffc 2520
0284adc6 2521 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2522 printf("Data Objects: %"PRIu64"\n"
0284adc6 2523 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2524 le64toh(f->header->n_data),
0284adc6 2525 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2526
0284adc6 2527 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2528 printf("Field Objects: %"PRIu64"\n"
0284adc6 2529 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2530 le64toh(f->header->n_fields),
0284adc6 2531 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2532
2533 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2534 printf("Tag Objects: %"PRIu64"\n",
2535 le64toh(f->header->n_tags));
3223f44f 2536 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2537 printf("Entry Array Objects: %"PRIu64"\n",
2538 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2539
2540 if (fstat(f->fd, &st) >= 0)
59f448cf 2541 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
7560fffc
LP
2542}
2543
fc68c929
LP
2544static int journal_file_warn_btrfs(JournalFile *f) {
2545 unsigned attrs;
2546 int r;
2547
2548 assert(f);
2549
2550 /* Before we write anything, check if the COW logic is turned
2551 * off on btrfs. Given our write pattern that is quite
2552 * unfriendly to COW file systems this should greatly improve
2553 * performance on COW file systems, such as btrfs, at the
2554 * expense of data integrity features (which shouldn't be too
2555 * bad, given that we do our own checksumming). */
2556
2557 r = btrfs_is_filesystem(f->fd);
2558 if (r < 0)
2559 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2560 if (!r)
2561 return 0;
2562
2563 r = read_attr_fd(f->fd, &attrs);
2564 if (r < 0)
2565 return log_warning_errno(r, "Failed to read file attributes: %m");
2566
2567 if (attrs & FS_NOCOW_FL) {
2568 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2569 return 0;
2570 }
2571
2572 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2573 "This is likely to slow down journal access substantially, please consider turning "
2574 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2575
2576 return 1;
2577}
2578
0284adc6
LP
2579int journal_file_open(
2580 const char *fname,
2581 int flags,
2582 mode_t mode,
2583 bool compress,
baed47c3 2584 bool seal,
0284adc6
LP
2585 JournalMetrics *metrics,
2586 MMapCache *mmap_cache,
2587 JournalFile *template,
2588 JournalFile **ret) {
7560fffc 2589
fa6ac760 2590 bool newly_created = false;
0284adc6 2591 JournalFile *f;
fa6ac760 2592 void *h;
0284adc6 2593 int r;
7560fffc 2594
0284adc6 2595 assert(fname);
0559d3a5 2596 assert(ret);
7560fffc 2597
0284adc6
LP
2598 if ((flags & O_ACCMODE) != O_RDONLY &&
2599 (flags & O_ACCMODE) != O_RDWR)
2600 return -EINVAL;
7560fffc 2601
a0108012
LP
2602 if (!endswith(fname, ".journal") &&
2603 !endswith(fname, ".journal~"))
0284adc6 2604 return -EINVAL;
7560fffc 2605
0284adc6
LP
2606 f = new0(JournalFile, 1);
2607 if (!f)
2608 return -ENOMEM;
7560fffc 2609
0284adc6
LP
2610 f->fd = -1;
2611 f->mode = mode;
7560fffc 2612
0284adc6
LP
2613 f->flags = flags;
2614 f->prot = prot_from_flags(flags);
2615 f->writable = (flags & O_ACCMODE) != O_RDONLY;
92261977 2616#if defined(HAVE_LZ4)
d89c8fdf
ZJS
2617 f->compress_lz4 = compress;
2618#elif defined(HAVE_XZ)
2619 f->compress_xz = compress;
48b61739 2620#endif
49a32d43 2621#ifdef HAVE_GCRYPT
baed47c3 2622 f->seal = seal;
49a32d43 2623#endif
7560fffc 2624
0284adc6
LP
2625 if (mmap_cache)
2626 f->mmap = mmap_cache_ref(mmap_cache);
2627 else {
84168d80 2628 f->mmap = mmap_cache_new();
0284adc6
LP
2629 if (!f->mmap) {
2630 r = -ENOMEM;
2631 goto fail;
2632 }
2633 }
7560fffc 2634
0284adc6
LP
2635 f->path = strdup(fname);
2636 if (!f->path) {
2637 r = -ENOMEM;
2638 goto fail;
2639 }
7560fffc 2640
4743015d 2641 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
2642 if (!f->chain_cache) {
2643 r = -ENOMEM;
2644 goto fail;
2645 }
2646
0284adc6
LP
2647 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2648 if (f->fd < 0) {
2649 r = -errno;
2650 goto fail;
7560fffc 2651 }
7560fffc 2652
2678031a
LP
2653 r = journal_file_fstat(f);
2654 if (r < 0)
0284adc6 2655 goto fail;
7560fffc 2656
0284adc6 2657 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a 2658
fc68c929 2659 (void) journal_file_warn_btrfs(f);
11689d2a 2660
fb0951b0
LP
2661 /* Let's attach the creation time to the journal file,
2662 * so that the vacuuming code knows the age of this
2663 * file even if the file might end up corrupted one
2664 * day... Ideally we'd just use the creation time many
2665 * file systems maintain for each file, but there is
2666 * currently no usable API to query this, hence let's
2667 * emulate this via extended attributes. If extended
2668 * attributes are not supported we'll just skip this,
7517e174 2669 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0 2670
d61b600d 2671 fd_setcrtime(f->fd, 0);
7560fffc 2672
feb12d3e 2673#ifdef HAVE_GCRYPT
0284adc6 2674 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2675 * just don't do sealing */
49a32d43
LP
2676 if (f->seal) {
2677 r = journal_file_fss_load(f);
2678 if (r < 0)
2679 f->seal = false;
2680 }
feb12d3e 2681#endif
7560fffc 2682
0284adc6
LP
2683 r = journal_file_init_header(f, template);
2684 if (r < 0)
2685 goto fail;
7560fffc 2686
2678031a
LP
2687 r = journal_file_fstat(f);
2688 if (r < 0)
0284adc6 2689 goto fail;
fb0951b0
LP
2690
2691 newly_created = true;
0284adc6 2692 }
7560fffc 2693
0284adc6
LP
2694 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2695 r = -EIO;
2696 goto fail;
2697 }
7560fffc 2698
fa6ac760 2699 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
977eaa1e 2700 if (r < 0)
0284adc6 2701 goto fail;
7560fffc 2702
fa6ac760
LP
2703 f->header = h;
2704
0284adc6
LP
2705 if (!newly_created) {
2706 r = journal_file_verify_header(f);
2707 if (r < 0)
2708 goto fail;
2709 }
7560fffc 2710
feb12d3e 2711#ifdef HAVE_GCRYPT
0284adc6 2712 if (!newly_created && f->writable) {
baed47c3 2713 r = journal_file_fss_load(f);
0284adc6
LP
2714 if (r < 0)
2715 goto fail;
2716 }
feb12d3e 2717#endif
cec736d2
LP
2718
2719 if (f->writable) {
4a92baf3
LP
2720 if (metrics) {
2721 journal_default_metrics(metrics, f->fd);
2722 f->metrics = *metrics;
2723 } else if (template)
2724 f->metrics = template->metrics;
2725
cec736d2
LP
2726 r = journal_file_refresh_header(f);
2727 if (r < 0)
2728 goto fail;
2729 }
2730
feb12d3e 2731#ifdef HAVE_GCRYPT
baed47c3 2732 r = journal_file_hmac_setup(f);
14d10188
LP
2733 if (r < 0)
2734 goto fail;
feb12d3e 2735#endif
14d10188 2736
cec736d2 2737 if (newly_created) {
de190aef 2738 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2739 if (r < 0)
2740 goto fail;
2741
de190aef 2742 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2743 if (r < 0)
2744 goto fail;
7560fffc 2745
feb12d3e 2746#ifdef HAVE_GCRYPT
7560fffc
LP
2747 r = journal_file_append_first_tag(f);
2748 if (r < 0)
2749 goto fail;
feb12d3e 2750#endif
cec736d2
LP
2751 }
2752
fa6ac760
LP
2753 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2754 r = -EIO;
2755 goto fail;
2756 }
2757
0559d3a5 2758 *ret = f;
cec736d2
LP
2759 return 0;
2760
2761fail:
fa6ac760
LP
2762 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2763 r = -EIO;
2764
cec736d2
LP
2765 journal_file_close(f);
2766
2767 return r;
2768}
0ac38b70 2769
baed47c3 2770int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2771 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2772 size_t l;
2773 JournalFile *old_file, *new_file = NULL;
2774 int r;
2775
2776 assert(f);
2777 assert(*f);
2778
2779 old_file = *f;
2780
2781 if (!old_file->writable)
2782 return -EINVAL;
2783
2784 if (!endswith(old_file->path, ".journal"))
2785 return -EINVAL;
2786
2787 l = strlen(old_file->path);
57535f47
ZJS
2788 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2789 (int) l - 8, old_file->path,
2790 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2791 le64toh((*f)->header->head_entry_seqnum),
2792 le64toh((*f)->header->head_entry_realtime));
2793 if (r < 0)
0ac38b70
LP
2794 return -ENOMEM;
2795
2678031a
LP
2796 /* Try to rename the file to the archived version. If the file
2797 * already was deleted, we'll get ENOENT, let's ignore that
2798 * case. */
0ac38b70 2799 r = rename(old_file->path, p);
2678031a 2800 if (r < 0 && errno != ENOENT)
0ac38b70
LP
2801 return -errno;
2802
ccdbaf91 2803 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2804
f27a3864
LP
2805 /* Currently, btrfs is not very good with out write patterns
2806 * and fragments heavily. Let's defrag our journal files when
2807 * we archive them */
2808 old_file->defrag_on_close = true;
2809
baed47c3 2810 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2811 journal_file_close(old_file);
2812
2813 *f = new_file;
2814 return r;
2815}
2816
9447a7f1
LP
2817int journal_file_open_reliably(
2818 const char *fname,
2819 int flags,
2820 mode_t mode,
7560fffc 2821 bool compress,
baed47c3 2822 bool seal,
4a92baf3 2823 JournalMetrics *metrics,
27370278 2824 MMapCache *mmap_cache,
9447a7f1
LP
2825 JournalFile *template,
2826 JournalFile **ret) {
2827
2828 int r;
2829 size_t l;
ed375beb 2830 _cleanup_free_ char *p = NULL;
9447a7f1 2831
070052ab 2832 r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
288359db
ZJS
2833 if (!IN_SET(r,
2834 -EBADMSG, /* corrupted */
2835 -ENODATA, /* truncated */
2836 -EHOSTDOWN, /* other machine */
2837 -EPROTONOSUPPORT, /* incompatible feature */
2838 -EBUSY, /* unclean shutdown */
2839 -ESHUTDOWN, /* already archived */
2840 -EIO, /* IO error, including SIGBUS on mmap */
2841 -EIDRM /* File has been deleted */))
9447a7f1
LP
2842 return r;
2843
2844 if ((flags & O_ACCMODE) == O_RDONLY)
2845 return r;
2846
2847 if (!(flags & O_CREAT))
2848 return r;
2849
7560fffc
LP
2850 if (!endswith(fname, ".journal"))
2851 return r;
2852
5c70eab4
LP
2853 /* The file is corrupted. Rotate it away and try it again (but only once) */
2854
9447a7f1 2855 l = strlen(fname);
d587eca5 2856 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
57535f47 2857 (int) l - 8, fname,
d587eca5 2858 now(CLOCK_REALTIME),
9bf3b535 2859 random_u64()) < 0)
9447a7f1
LP
2860 return -ENOMEM;
2861
65089b82 2862 if (rename(fname, p) < 0)
9447a7f1
LP
2863 return -errno;
2864
f27a3864
LP
2865 /* btrfs doesn't cope well with our write pattern and
2866 * fragments heavily. Let's defrag all files we rotate */
11689d2a
LP
2867
2868 (void) chattr_path(p, false, FS_NOCOW_FL);
f27a3864
LP
2869 (void) btrfs_defrag(p);
2870
65089b82 2871 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2872
070052ab 2873 return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
9447a7f1
LP
2874}
2875
cf244689
LP
2876int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2877 uint64_t i, n;
2878 uint64_t q, xor_hash = 0;
2879 int r;
2880 EntryItem *items;
2881 dual_timestamp ts;
2882
2883 assert(from);
2884 assert(to);
2885 assert(o);
2886 assert(p);
2887
2888 if (!to->writable)
2889 return -EPERM;
2890
2891 ts.monotonic = le64toh(o->entry.monotonic);
2892 ts.realtime = le64toh(o->entry.realtime);
2893
cf244689 2894 n = journal_file_entry_n_items(o);
4faa7004
TA
2895 /* alloca() can't take 0, hence let's allocate at least one */
2896 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
2897
2898 for (i = 0; i < n; i++) {
4fd052ae
FC
2899 uint64_t l, h;
2900 le64_t le_hash;
cf244689
LP
2901 size_t t;
2902 void *data;
2903 Object *u;
2904
2905 q = le64toh(o->entry.items[i].object_offset);
2906 le_hash = o->entry.items[i].hash;
2907
2908 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2909 if (r < 0)
2910 return r;
2911
2912 if (le_hash != o->data.hash)
2913 return -EBADMSG;
2914
2915 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2916 t = (size_t) l;
2917
2918 /* We hit the limit on 32bit machines */
2919 if ((uint64_t) t != l)
2920 return -E2BIG;
2921
d89c8fdf 2922 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 2923#if defined(HAVE_XZ) || defined(HAVE_LZ4)
a7f7d1bd 2924 size_t rsize = 0;
cf244689 2925
d89c8fdf
ZJS
2926 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2927 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2928 if (r < 0)
2929 return r;
cf244689
LP
2930
2931 data = from->compress_buffer;
2932 l = rsize;
3b1a55e1
ZJS
2933#else
2934 return -EPROTONOSUPPORT;
2935#endif
cf244689
LP
2936 } else
2937 data = o->data.payload;
2938
2939 r = journal_file_append_data(to, data, l, &u, &h);
2940 if (r < 0)
2941 return r;
2942
2943 xor_hash ^= le64toh(u->data.hash);
2944 items[i].object_offset = htole64(h);
2945 items[i].hash = u->data.hash;
2946
2947 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2948 if (r < 0)
2949 return r;
2950 }
2951
fa6ac760
LP
2952 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2953
2954 if (mmap_cache_got_sigbus(to->mmap, to->fd))
2955 return -EIO;
2956
2957 return r;
cf244689 2958}
babfc091
LP
2959
2960void journal_default_metrics(JournalMetrics *m, int fd) {
2961 uint64_t fs_size = 0;
2962 struct statvfs ss;
a7bc2c2a 2963 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
babfc091
LP
2964
2965 assert(m);
2966 assert(fd >= 0);
2967
2968 if (fstatvfs(fd, &ss) >= 0)
2969 fs_size = ss.f_frsize * ss.f_blocks;
2970
2971 if (m->max_use == (uint64_t) -1) {
2972
2973 if (fs_size > 0) {
2974 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2975
2976 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2977 m->max_use = DEFAULT_MAX_USE_UPPER;
2978
2979 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2980 m->max_use = DEFAULT_MAX_USE_LOWER;
2981 } else
2982 m->max_use = DEFAULT_MAX_USE_LOWER;
2983 } else {
2984 m->max_use = PAGE_ALIGN(m->max_use);
2985
2986 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2987 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2988 }
2989
2990 if (m->max_size == (uint64_t) -1) {
2991 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2992
2993 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2994 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2995 } else
2996 m->max_size = PAGE_ALIGN(m->max_size);
2997
2998 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2999 m->max_size = JOURNAL_FILE_SIZE_MIN;
3000
3001 if (m->max_size*2 > m->max_use)
3002 m->max_use = m->max_size*2;
3003
3004 if (m->min_size == (uint64_t) -1)
3005 m->min_size = JOURNAL_FILE_SIZE_MIN;
3006 else {
3007 m->min_size = PAGE_ALIGN(m->min_size);
3008
3009 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3010 m->min_size = JOURNAL_FILE_SIZE_MIN;
3011
3012 if (m->min_size > m->max_size)
3013 m->max_size = m->min_size;
3014 }
3015
3016 if (m->keep_free == (uint64_t) -1) {
3017
3018 if (fs_size > 0) {
8621b110 3019 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
3020
3021 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3022 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3023
3024 } else
3025 m->keep_free = DEFAULT_KEEP_FREE;
3026 }
3027
2b43f939
LP
3028 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
3029 format_bytes(a, sizeof(a), m->max_use),
3030 format_bytes(b, sizeof(b), m->max_size),
3031 format_bytes(c, sizeof(c), m->min_size),
3032 format_bytes(d, sizeof(d), m->keep_free));
babfc091 3033}
08984293
LP
3034
3035int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
3036 assert(f);
3037 assert(from || to);
3038
3039 if (from) {
162566a4
LP
3040 if (f->header->head_entry_realtime == 0)
3041 return -ENOENT;
08984293 3042
162566a4 3043 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
3044 }
3045
3046 if (to) {
162566a4
LP
3047 if (f->header->tail_entry_realtime == 0)
3048 return -ENOENT;
08984293 3049
162566a4 3050 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
3051 }
3052
3053 return 1;
3054}
3055
3056int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
3057 Object *o;
3058 uint64_t p;
3059 int r;
3060
3061 assert(f);
3062 assert(from || to);
3063
47838ab3 3064 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
3065 if (r <= 0)
3066 return r;
3067
3068 if (le64toh(o->data.n_entries) <= 0)
3069 return 0;
3070
3071 if (from) {
3072 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3073 if (r < 0)
3074 return r;
3075
3076 *from = le64toh(o->entry.monotonic);
3077 }
3078
3079 if (to) {
3080 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3081 if (r < 0)
3082 return r;
3083
3084 r = generic_array_get_plus_one(f,
3085 le64toh(o->data.entry_offset),
3086 le64toh(o->data.entry_array_offset),
3087 le64toh(o->data.n_entries)-1,
3088 &o, NULL);
3089 if (r <= 0)
3090 return r;
3091
3092 *to = le64toh(o->entry.monotonic);
3093 }
3094
3095 return 1;
3096}
dca6219e 3097
fb0951b0 3098bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
3099 assert(f);
3100
3101 /* If we gained new header fields we gained new features,
3102 * hence suggest a rotation */
361f9cbc
LP
3103 if (le64toh(f->header->header_size) < sizeof(Header)) {
3104 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3105 return true;
361f9cbc 3106 }
dca6219e
LP
3107
3108 /* Let's check if the hash tables grew over a certain fill
3109 * level (75%, borrowing this value from Java's hash table
3110 * implementation), and if so suggest a rotation. To calculate
3111 * the fill level we need the n_data field, which only exists
3112 * in newer versions. */
3113
3114 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3115 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3116 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3117 f->path,
3118 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3119 le64toh(f->header->n_data),
3120 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3121 (unsigned long long) f->last_stat.st_size,
3122 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3123 return true;
361f9cbc 3124 }
dca6219e
LP
3125
3126 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3127 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3128 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3129 f->path,
3130 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3131 le64toh(f->header->n_fields),
3132 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3133 return true;
361f9cbc 3134 }
dca6219e 3135
0598fd4a
LP
3136 /* Are the data objects properly indexed by field objects? */
3137 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3138 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3139 le64toh(f->header->n_data) > 0 &&
3140 le64toh(f->header->n_fields) == 0)
3141 return true;
3142
fb0951b0
LP
3143 if (max_file_usec > 0) {
3144 usec_t t, h;
3145
3146 h = le64toh(f->header->head_entry_realtime);
3147 t = now(CLOCK_REALTIME);
3148
3149 if (h > 0 && t > h + max_file_usec)
3150 return true;
3151 }
3152
dca6219e
LP
3153 return false;
3154}