]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
Merge pull request #2096 from teg/resolved-cache
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
cec736d2 22#include <errno.h>
cec736d2 23#include <fcntl.h>
11689d2a 24#include <linux/fs.h>
07630cea
LP
25#include <stddef.h>
26#include <sys/mman.h>
27#include <sys/statvfs.h>
28#include <sys/uio.h>
29#include <unistd.h>
fb0951b0 30
b5efdb8a 31#include "alloc-util.h"
f27a3864 32#include "btrfs-util.h"
c8b3094d 33#include "chattr-util.h"
07630cea 34#include "compress.h"
3ffd4af2 35#include "fd-util.h"
0284adc6 36#include "journal-authenticate.h"
cec736d2
LP
37#include "journal-def.h"
38#include "journal-file.h"
39#include "lookup3.h"
6bedfcbb 40#include "parse-util.h"
3df3e884 41#include "random-util.h"
07630cea 42#include "string-util.h"
89a5a90c 43#include "xattr-util.h"
cec736d2 44
4a92baf3
LP
/* Sizes, in bytes, of the data and field hash tables created for new files:
 * room for 2047 and 333 HashItem slots respectively. */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL * sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL * sizeof(HashItem))

/* Data payloads of at least this many bytes are candidates for compression. */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* Minimum size of a journal file */
#define JOURNAL_FILE_SIZE_MIN (512ULL * 1024ULL)                        /* 512 KiB */

/* Lower and upper bounds applied when the max_use value is deduced
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL * 1024ULL * 1024ULL)                /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL * 1024ULL * 1024ULL * 1024ULL)      /* 4 GiB */

/* Default minimal use limit: how much we'll use even if keep_free
 * suggests otherwise. */
#define DEFAULT_MIN_USE (1ULL * 1024ULL * 1024ULL)                      /* 1 MiB */

/* Upper bound applied when max_size is deduced from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL * 1024ULL * 1024ULL)             /* 128 MiB */

/* Upper bound applied when the keep_free value is deduced from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL * 1024ULL * 1024ULL * 1024ULL)    /* 4 GiB */

/* keep_free value used when the file system size cannot be
 * determined */
#define DEFAULT_KEEP_FREE (1024ULL * 1024ULL)                           /* 1 MiB */

/* Default maximum number of journal files to keep around. */
#define DEFAULT_N_MAX_FILES (100)

/* n_data was the first field added after the initial file format
 * design, hence every valid header extends at least up to it. */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* Maximum number of entries kept in the entry array chain cache */
#define CHAIN_CACHE_MAX 20

/* Granularity by which the journal file is grown each time more space
 * has to be allocated. */
#define FILE_SIZE_INCREASE (8ULL * 1024ULL * 1024ULL)                   /* 8 MiB */

/* Re-run fstat() on the file at least this often, to detect deletions */
#define LAST_STAT_REFRESH_USEC (5 * USEC_PER_SEC)

/* The mmap context used for the header: one above the last defined
 * object type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX
89
9588bc32 90static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
91 assert(f);
92
93 if (!f->writable)
94 return -EPERM;
95
96 if (!(f->fd >= 0 && f->header))
97 return -EINVAL;
98
fa6ac760
LP
99 if (mmap_cache_got_sigbus(f->mmap, f->fd))
100 return -EIO;
101
26687bf8
OS
102 switch(f->header->state) {
103 case STATE_ONLINE:
104 return 0;
105
106 case STATE_OFFLINE:
107 f->header->state = STATE_ONLINE;
108 fsync(f->fd);
109 return 0;
110
111 default:
112 return -EINVAL;
113 }
114}
115
116int journal_file_set_offline(JournalFile *f) {
117 assert(f);
118
119 if (!f->writable)
120 return -EPERM;
121
122 if (!(f->fd >= 0 && f->header))
123 return -EINVAL;
124
125 if (f->header->state != STATE_ONLINE)
126 return 0;
127
128 fsync(f->fd);
129
fa6ac760
LP
130 if (mmap_cache_got_sigbus(f->mmap, f->fd))
131 return -EIO;
132
26687bf8
OS
133 f->header->state = STATE_OFFLINE;
134
fa6ac760
LP
135 if (mmap_cache_got_sigbus(f->mmap, f->fd))
136 return -EIO;
137
26687bf8
OS
138 fsync(f->fd);
139
140 return 0;
141}
142
804ae586 143JournalFile* journal_file_close(JournalFile *f) {
de190aef 144 assert(f);
cec736d2 145
feb12d3e 146#ifdef HAVE_GCRYPT
b0af6f41 147 /* Write the final tag */
c586dbf1 148 if (f->seal && f->writable)
b0af6f41 149 journal_file_append_tag(f);
feb12d3e 150#endif
b0af6f41 151
26687bf8 152 journal_file_set_offline(f);
cec736d2 153
fa6ac760
LP
154 if (f->mmap && f->fd >= 0)
155 mmap_cache_close_fd(f->mmap, f->fd);
cec736d2 156
11689d2a
LP
157 if (f->fd >= 0 && f->defrag_on_close) {
158
159 /* Be friendly to btrfs: turn COW back on again now,
160 * and defragment the file. We won't write to the file
161 * ever again, hence remove all fragmentation, and
162 * reenable all the good bits COW usually provides
163 * (such as data checksumming). */
164
1ed8f8c1 165 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
11689d2a
LP
166 (void) btrfs_defrag_fd(f->fd);
167 }
f27a3864 168
03e334a1 169 safe_close(f->fd);
cec736d2 170 free(f->path);
807e17f0 171
f649045c 172 mmap_cache_unref(f->mmap);
16e9f408 173
4743015d 174 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 175
d89c8fdf 176#if defined(HAVE_XZ) || defined(HAVE_LZ4)
807e17f0
LP
177 free(f->compress_buffer);
178#endif
179
7560fffc 180#ifdef HAVE_GCRYPT
baed47c3
LP
181 if (f->fss_file)
182 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
dc4ebc07 183 else
b7c9ae91
LP
184 free(f->fsprg_state);
185
186 free(f->fsprg_seed);
7560fffc
LP
187
188 if (f->hmac)
189 gcry_md_close(f->hmac);
190#endif
191
cec736d2 192 free(f);
804ae586 193 return NULL;
cec736d2
LP
194}
195
0ac38b70 196static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 197 Header h = {};
cec736d2
LP
198 ssize_t k;
199 int r;
200
201 assert(f);
202
7560fffc 203 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 204 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 205
d89c8fdf
ZJS
206 h.incompatible_flags |= htole32(
207 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
208 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 209
d89c8fdf
ZJS
210 h.compatible_flags = htole32(
211 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 212
cec736d2
LP
213 r = sd_id128_randomize(&h.file_id);
214 if (r < 0)
215 return r;
216
0ac38b70
LP
217 if (template) {
218 h.seqnum_id = template->header->seqnum_id;
beec0085 219 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
220 } else
221 h.seqnum_id = h.file_id;
cec736d2
LP
222
223 k = pwrite(f->fd, &h, sizeof(h), 0);
224 if (k < 0)
225 return -errno;
226
227 if (k != sizeof(h))
228 return -EIO;
229
230 return 0;
231}
232
233static int journal_file_refresh_header(JournalFile *f) {
de190aef 234 sd_id128_t boot_id;
fa6ac760 235 int r;
cec736d2
LP
236
237 assert(f);
238
239 r = sd_id128_get_machine(&f->header->machine_id);
240 if (r < 0)
241 return r;
242
de190aef 243 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
244 if (r < 0)
245 return r;
246
de190aef
LP
247 if (sd_id128_equal(boot_id, f->header->boot_id))
248 f->tail_entry_monotonic_valid = true;
249
250 f->header->boot_id = boot_id;
251
fa6ac760 252 r = journal_file_set_online(f);
b788cc23 253
7560fffc 254 /* Sync the online state to disk */
a676e665 255 fsync(f->fd);
b788cc23 256
fa6ac760 257 return r;
cec736d2
LP
258}
259
260static int journal_file_verify_header(JournalFile *f) {
d89c8fdf
ZJS
261 uint32_t flags;
262
cec736d2
LP
263 assert(f);
264
7560fffc 265 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
266 return -EBADMSG;
267
7560fffc
LP
268 /* In both read and write mode we refuse to open files with
269 * incompatible flags we don't know */
d89c8fdf
ZJS
270 flags = le32toh(f->header->incompatible_flags);
271 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
272 if (flags & ~HEADER_INCOMPATIBLE_ANY)
273 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
274 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
275 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
276 if (flags)
277 log_debug("Journal file %s uses incompatible flags %"PRIx32
278 " disabled at compilation time.", f->path, flags);
cec736d2 279 return -EPROTONOSUPPORT;
d89c8fdf 280 }
cec736d2 281
7560fffc
LP
282 /* When open for writing we refuse to open files with
283 * compatible flags, too */
d89c8fdf
ZJS
284 flags = le32toh(f->header->compatible_flags);
285 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
286 if (flags & ~HEADER_COMPATIBLE_ANY)
287 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
288 f->path, flags & ~HEADER_COMPATIBLE_ANY);
289 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
290 if (flags)
291 log_debug("Journal file %s uses compatible flags %"PRIx32
292 " disabled at compilation time.", f->path, flags);
293 return -EPROTONOSUPPORT;
7560fffc
LP
294 }
295
db11ac1a
LP
296 if (f->header->state >= _STATE_MAX)
297 return -EBADMSG;
298
dca6219e
LP
299 /* The first addition was n_data, so check that we are at least this large */
300 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
301 return -EBADMSG;
302
8088cbd3 303 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
304 return -EBADMSG;
305
db11ac1a
LP
306 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
307 return -ENODATA;
308
309 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
310 return -ENODATA;
311
7762e02b
LP
312 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
313 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
314 !VALID64(le64toh(f->header->tail_object_offset)) ||
315 !VALID64(le64toh(f->header->entry_array_offset)))
316 return -ENODATA;
317
cec736d2 318 if (f->writable) {
ccdbaf91 319 uint8_t state;
cec736d2
LP
320 sd_id128_t machine_id;
321 int r;
322
323 r = sd_id128_get_machine(&machine_id);
324 if (r < 0)
325 return r;
326
327 if (!sd_id128_equal(machine_id, f->header->machine_id))
328 return -EHOSTDOWN;
329
de190aef 330 state = f->header->state;
cec736d2 331
71fa6f00
LP
332 if (state == STATE_ONLINE) {
333 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
334 return -EBUSY;
335 } else if (state == STATE_ARCHIVED)
cec736d2 336 return -ESHUTDOWN;
71fa6f00 337 else if (state != STATE_OFFLINE) {
8facc349 338 log_debug("Journal file %s has unknown state %i.", f->path, state);
71fa6f00
LP
339 return -EBUSY;
340 }
cec736d2
LP
341 }
342
d89c8fdf
ZJS
343 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
344 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 345
f1889c91 346 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 347
cec736d2
LP
348 return 0;
349}
350
2678031a
LP
351static int journal_file_fstat(JournalFile *f) {
352 assert(f);
353 assert(f->fd >= 0);
354
355 if (fstat(f->fd, &f->last_stat) < 0)
356 return -errno;
357
358 f->last_stat_usec = now(CLOCK_MONOTONIC);
359
360 /* Refuse appending to files that are already deleted */
361 if (f->last_stat.st_nlink <= 0)
362 return -EIDRM;
363
364 return 0;
365}
366
cec736d2 367static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 368 uint64_t old_size, new_size;
fec2aa2f 369 int r;
cec736d2
LP
370
371 assert(f);
372
cec736d2 373 /* We assume that this file is not sparse, and we know that
38ac38b2 374 * for sure, since we always call posix_fallocate()
cec736d2
LP
375 * ourselves */
376
fa6ac760
LP
377 if (mmap_cache_got_sigbus(f->mmap, f->fd))
378 return -EIO;
379
cec736d2 380 old_size =
23b0b2b2 381 le64toh(f->header->header_size) +
cec736d2
LP
382 le64toh(f->header->arena_size);
383
bc85bfee 384 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
385 if (new_size < le64toh(f->header->header_size))
386 new_size = le64toh(f->header->header_size);
bc85bfee 387
2678031a
LP
388 if (new_size <= old_size) {
389
390 /* We already pre-allocated enough space, but before
391 * we write to it, let's check with fstat() if the
392 * file got deleted, in order make sure we don't throw
393 * away the data immediately. Don't check fstat() for
394 * all writes though, but only once ever 10s. */
395
396 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
397 return 0;
398
399 return journal_file_fstat(f);
400 }
401
402 /* Allocate more space. */
cec736d2 403
a676e665 404 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 405 return -E2BIG;
cec736d2 406
a676e665 407 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
408 struct statvfs svfs;
409
410 if (fstatvfs(f->fd, &svfs) >= 0) {
411 uint64_t available;
412
070052ab 413 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
cec736d2
LP
414
415 if (new_size - old_size > available)
416 return -E2BIG;
417 }
418 }
419
eda4b58b
LP
420 /* Increase by larger blocks at once */
421 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
422 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
423 new_size = f->metrics.max_size;
424
bc85bfee
LP
425 /* Note that the glibc fallocate() fallback is very
426 inefficient, hence we try to minimize the allocation area
427 as we can. */
fec2aa2f
GV
428 r = posix_fallocate(f->fd, old_size, new_size - old_size);
429 if (r != 0)
430 return -r;
cec736d2 431
23b0b2b2 432 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 433
2678031a 434 return journal_file_fstat(f);
cec736d2
LP
435}
436
78519831 437static unsigned type_to_context(ObjectType type) {
d3d3208f 438 /* One context for each type, plus one catch-all for the rest */
69adae51 439 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 440 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 441 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
442}
443
7a9dabea 444static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
2678031a
LP
445 int r;
446
cec736d2 447 assert(f);
cec736d2
LP
448 assert(ret);
449
7762e02b
LP
450 if (size <= 0)
451 return -EINVAL;
452
2a59ea54 453 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
454 if (offset + size > (uint64_t) f->last_stat.st_size) {
455 /* Hmm, out of range? Let's refresh the fstat() data
456 * first, before we trust that check. */
457
2678031a
LP
458 r = journal_file_fstat(f);
459 if (r < 0)
460 return r;
461
462 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
463 return -EADDRNOTAVAIL;
464 }
465
7a9dabea 466 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
467}
468
16e9f408
LP
469static uint64_t minimum_header_size(Object *o) {
470
b8e891e6 471 static const uint64_t table[] = {
16e9f408
LP
472 [OBJECT_DATA] = sizeof(DataObject),
473 [OBJECT_FIELD] = sizeof(FieldObject),
474 [OBJECT_ENTRY] = sizeof(EntryObject),
475 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
476 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
477 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
478 [OBJECT_TAG] = sizeof(TagObject),
479 };
480
481 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
482 return sizeof(ObjectHeader);
483
484 return table[o->object.type];
485}
486
78519831 487int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
488 int r;
489 void *t;
490 Object *o;
491 uint64_t s;
492
493 assert(f);
494 assert(ret);
495
db11ac1a
LP
496 /* Objects may only be located at multiple of 64 bit */
497 if (!VALID64(offset))
498 return -EFAULT;
499
7a9dabea 500 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
501 if (r < 0)
502 return r;
503
504 o = (Object*) t;
505 s = le64toh(o->object.size);
506
507 if (s < sizeof(ObjectHeader))
508 return -EBADMSG;
509
16e9f408
LP
510 if (o->object.type <= OBJECT_UNUSED)
511 return -EBADMSG;
512
513 if (s < minimum_header_size(o))
514 return -EBADMSG;
515
d05089d8 516 if (type > OBJECT_UNUSED && o->object.type != type)
cec736d2
LP
517 return -EBADMSG;
518
519 if (s > sizeof(ObjectHeader)) {
7a9dabea 520 r = journal_file_move_to(f, type, false, offset, s, &t);
cec736d2
LP
521 if (r < 0)
522 return r;
523
524 o = (Object*) t;
525 }
526
cec736d2
LP
527 *ret = o;
528 return 0;
529}
530
d98cc1f2 531static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
532 uint64_t r;
533
534 assert(f);
535
beec0085 536 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
537
538 if (seqnum) {
de190aef 539 /* If an external seqnum counter was passed, we update
c2373f84
LP
540 * both the local and the external one, and set it to
541 * the maximum of both */
542
543 if (*seqnum + 1 > r)
544 r = *seqnum + 1;
545
546 *seqnum = r;
547 }
548
beec0085 549 f->header->tail_entry_seqnum = htole64(r);
cec736d2 550
beec0085
LP
551 if (f->header->head_entry_seqnum == 0)
552 f->header->head_entry_seqnum = htole64(r);
de190aef 553
cec736d2
LP
554 return r;
555}
556
78519831 557int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
558 int r;
559 uint64_t p;
560 Object *tail, *o;
561 void *t;
562
563 assert(f);
d05089d8 564 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
565 assert(size >= sizeof(ObjectHeader));
566 assert(offset);
567 assert(ret);
568
26687bf8
OS
569 r = journal_file_set_online(f);
570 if (r < 0)
571 return r;
572
cec736d2 573 p = le64toh(f->header->tail_object_offset);
cec736d2 574 if (p == 0)
23b0b2b2 575 p = le64toh(f->header->header_size);
cec736d2 576 else {
d05089d8 577 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
578 if (r < 0)
579 return r;
580
581 p += ALIGN64(le64toh(tail->object.size));
582 }
583
584 r = journal_file_allocate(f, p, size);
585 if (r < 0)
586 return r;
587
fcde2389 588 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
589 if (r < 0)
590 return r;
591
592 o = (Object*) t;
593
594 zero(o->object);
de190aef 595 o->object.type = type;
cec736d2
LP
596 o->object.size = htole64(size);
597
598 f->header->tail_object_offset = htole64(p);
cec736d2
LP
599 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
600
601 *ret = o;
602 *offset = p;
603
604 return 0;
605}
606
de190aef 607static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
608 uint64_t s, p;
609 Object *o;
610 int r;
611
612 assert(f);
613
070052ab
LP
614 /* We estimate that we need 1 hash table entry per 768 bytes
615 of journal file and we want to make sure we never get
616 beyond 75% fill level. Calculate the hash table size for
617 the maximum file size based on these metrics. */
4a92baf3 618
dfabe643 619 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
620 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
621 s = DEFAULT_DATA_HASH_TABLE_SIZE;
622
507f22bd 623 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 624
de190aef
LP
625 r = journal_file_append_object(f,
626 OBJECT_DATA_HASH_TABLE,
627 offsetof(Object, hash_table.items) + s,
628 &o, &p);
cec736d2
LP
629 if (r < 0)
630 return r;
631
29804cc1 632 memzero(o->hash_table.items, s);
cec736d2 633
de190aef
LP
634 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
635 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
636
637 return 0;
638}
639
de190aef 640static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
641 uint64_t s, p;
642 Object *o;
643 int r;
644
645 assert(f);
646
3c1668da
LP
647 /* We use a fixed size hash table for the fields as this
648 * number should grow very slowly only */
649
de190aef
LP
650 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
651 r = journal_file_append_object(f,
652 OBJECT_FIELD_HASH_TABLE,
653 offsetof(Object, hash_table.items) + s,
654 &o, &p);
cec736d2
LP
655 if (r < 0)
656 return r;
657
29804cc1 658 memzero(o->hash_table.items, s);
cec736d2 659
de190aef
LP
660 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
661 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
662
663 return 0;
664}
665
dade37d4 666int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
667 uint64_t s, p;
668 void *t;
669 int r;
670
671 assert(f);
672
dade37d4
LP
673 if (f->data_hash_table)
674 return 0;
675
de190aef
LP
676 p = le64toh(f->header->data_hash_table_offset);
677 s = le64toh(f->header->data_hash_table_size);
cec736d2 678
de190aef 679 r = journal_file_move_to(f,
16e9f408 680 OBJECT_DATA_HASH_TABLE,
fcde2389 681 true,
de190aef
LP
682 p, s,
683 &t);
cec736d2
LP
684 if (r < 0)
685 return r;
686
de190aef 687 f->data_hash_table = t;
cec736d2
LP
688 return 0;
689}
690
dade37d4 691int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
692 uint64_t s, p;
693 void *t;
694 int r;
695
696 assert(f);
697
dade37d4
LP
698 if (f->field_hash_table)
699 return 0;
700
de190aef
LP
701 p = le64toh(f->header->field_hash_table_offset);
702 s = le64toh(f->header->field_hash_table_size);
cec736d2 703
de190aef 704 r = journal_file_move_to(f,
16e9f408 705 OBJECT_FIELD_HASH_TABLE,
fcde2389 706 true,
de190aef
LP
707 p, s,
708 &t);
cec736d2
LP
709 if (r < 0)
710 return r;
711
de190aef 712 f->field_hash_table = t;
cec736d2
LP
713 return 0;
714}
715
3c1668da
LP
716static int journal_file_link_field(
717 JournalFile *f,
718 Object *o,
719 uint64_t offset,
720 uint64_t hash) {
721
805d1486 722 uint64_t p, h, m;
3c1668da
LP
723 int r;
724
725 assert(f);
726 assert(o);
727 assert(offset > 0);
728
729 if (o->object.type != OBJECT_FIELD)
730 return -EINVAL;
731
805d1486
LP
732 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
733 if (m <= 0)
734 return -EBADMSG;
3c1668da 735
805d1486 736 /* This might alter the window we are looking at */
3c1668da
LP
737 o->field.next_hash_offset = o->field.head_data_offset = 0;
738
805d1486 739 h = hash % m;
3c1668da
LP
740 p = le64toh(f->field_hash_table[h].tail_hash_offset);
741 if (p == 0)
742 f->field_hash_table[h].head_hash_offset = htole64(offset);
743 else {
744 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
745 if (r < 0)
746 return r;
747
748 o->field.next_hash_offset = htole64(offset);
749 }
750
751 f->field_hash_table[h].tail_hash_offset = htole64(offset);
752
753 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
754 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
755
756 return 0;
757}
758
759static int journal_file_link_data(
760 JournalFile *f,
761 Object *o,
762 uint64_t offset,
763 uint64_t hash) {
764
805d1486 765 uint64_t p, h, m;
cec736d2
LP
766 int r;
767
768 assert(f);
769 assert(o);
770 assert(offset > 0);
b588975f
LP
771
772 if (o->object.type != OBJECT_DATA)
773 return -EINVAL;
cec736d2 774
805d1486
LP
775 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
776 if (m <= 0)
777 return -EBADMSG;
48496df6 778
805d1486 779 /* This might alter the window we are looking at */
de190aef
LP
780 o->data.next_hash_offset = o->data.next_field_offset = 0;
781 o->data.entry_offset = o->data.entry_array_offset = 0;
782 o->data.n_entries = 0;
cec736d2 783
805d1486 784 h = hash % m;
8db4213e 785 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 786 if (p == 0)
cec736d2 787 /* Only entry in the hash table is easy */
de190aef 788 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 789 else {
48496df6
LP
790 /* Move back to the previous data object, to patch in
791 * pointer */
cec736d2 792
de190aef 793 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
794 if (r < 0)
795 return r;
796
de190aef 797 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
798 }
799
de190aef 800 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 801
dca6219e
LP
802 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
803 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
804
cec736d2
LP
805 return 0;
806}
807
3c1668da
LP
808int journal_file_find_field_object_with_hash(
809 JournalFile *f,
810 const void *field, uint64_t size, uint64_t hash,
811 Object **ret, uint64_t *offset) {
812
805d1486 813 uint64_t p, osize, h, m;
3c1668da
LP
814 int r;
815
816 assert(f);
817 assert(field && size > 0);
818
dade37d4
LP
819 /* If the field hash table is empty, we can't find anything */
820 if (le64toh(f->header->field_hash_table_size) <= 0)
821 return 0;
822
823 /* Map the field hash table, if it isn't mapped yet. */
824 r = journal_file_map_field_hash_table(f);
825 if (r < 0)
826 return r;
827
3c1668da
LP
828 osize = offsetof(Object, field.payload) + size;
829
805d1486 830 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
805d1486 831 if (m <= 0)
3c1668da
LP
832 return -EBADMSG;
833
805d1486 834 h = hash % m;
3c1668da
LP
835 p = le64toh(f->field_hash_table[h].head_hash_offset);
836
837 while (p > 0) {
838 Object *o;
839
840 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
841 if (r < 0)
842 return r;
843
844 if (le64toh(o->field.hash) == hash &&
845 le64toh(o->object.size) == osize &&
846 memcmp(o->field.payload, field, size) == 0) {
847
848 if (ret)
849 *ret = o;
850 if (offset)
851 *offset = p;
852
853 return 1;
854 }
855
856 p = le64toh(o->field.next_hash_offset);
857 }
858
859 return 0;
860}
861
862int journal_file_find_field_object(
863 JournalFile *f,
864 const void *field, uint64_t size,
865 Object **ret, uint64_t *offset) {
866
867 uint64_t hash;
868
869 assert(f);
870 assert(field && size > 0);
871
872 hash = hash64(field, size);
873
874 return journal_file_find_field_object_with_hash(f,
875 field, size, hash,
876 ret, offset);
877}
878
de190aef
LP
879int journal_file_find_data_object_with_hash(
880 JournalFile *f,
881 const void *data, uint64_t size, uint64_t hash,
882 Object **ret, uint64_t *offset) {
48496df6 883
805d1486 884 uint64_t p, osize, h, m;
cec736d2
LP
885 int r;
886
887 assert(f);
888 assert(data || size == 0);
889
dade37d4
LP
890 /* If there's no data hash table, then there's no entry. */
891 if (le64toh(f->header->data_hash_table_size) <= 0)
892 return 0;
893
894 /* Map the data hash table, if it isn't mapped yet. */
895 r = journal_file_map_data_hash_table(f);
896 if (r < 0)
897 return r;
898
cec736d2
LP
899 osize = offsetof(Object, data.payload) + size;
900
805d1486
LP
901 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
902 if (m <= 0)
bc85bfee
LP
903 return -EBADMSG;
904
805d1486 905 h = hash % m;
de190aef 906 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 907
de190aef
LP
908 while (p > 0) {
909 Object *o;
cec736d2 910
de190aef 911 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
912 if (r < 0)
913 return r;
914
807e17f0 915 if (le64toh(o->data.hash) != hash)
85a131e8 916 goto next;
807e17f0 917
d89c8fdf 918 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 919#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51 920 uint64_t l;
a7f7d1bd 921 size_t rsize = 0;
cec736d2 922
807e17f0
LP
923 l = le64toh(o->object.size);
924 if (l <= offsetof(Object, data.payload))
cec736d2
LP
925 return -EBADMSG;
926
807e17f0
LP
927 l -= offsetof(Object, data.payload);
928
d89c8fdf
ZJS
929 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
930 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
931 if (r < 0)
932 return r;
807e17f0 933
b785c858 934 if (rsize == size &&
807e17f0
LP
935 memcmp(f->compress_buffer, data, size) == 0) {
936
937 if (ret)
938 *ret = o;
939
940 if (offset)
941 *offset = p;
942
943 return 1;
944 }
3b1a55e1
ZJS
945#else
946 return -EPROTONOSUPPORT;
947#endif
807e17f0
LP
948 } else if (le64toh(o->object.size) == osize &&
949 memcmp(o->data.payload, data, size) == 0) {
950
cec736d2
LP
951 if (ret)
952 *ret = o;
953
954 if (offset)
955 *offset = p;
956
de190aef 957 return 1;
cec736d2
LP
958 }
959
85a131e8 960 next:
cec736d2
LP
961 p = le64toh(o->data.next_hash_offset);
962 }
963
de190aef
LP
964 return 0;
965}
966
967int journal_file_find_data_object(
968 JournalFile *f,
969 const void *data, uint64_t size,
970 Object **ret, uint64_t *offset) {
971
972 uint64_t hash;
973
974 assert(f);
975 assert(data || size == 0);
976
977 hash = hash64(data, size);
978
979 return journal_file_find_data_object_with_hash(f,
980 data, size, hash,
981 ret, offset);
982}
983
3c1668da
LP
984static int journal_file_append_field(
985 JournalFile *f,
986 const void *field, uint64_t size,
987 Object **ret, uint64_t *offset) {
988
989 uint64_t hash, p;
990 uint64_t osize;
991 Object *o;
992 int r;
993
994 assert(f);
995 assert(field && size > 0);
996
997 hash = hash64(field, size);
998
999 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1000 if (r < 0)
1001 return r;
1002 else if (r > 0) {
1003
1004 if (ret)
1005 *ret = o;
1006
1007 if (offset)
1008 *offset = p;
1009
1010 return 0;
1011 }
1012
1013 osize = offsetof(Object, field.payload) + size;
1014 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
1015 if (r < 0)
1016 return r;
3c1668da
LP
1017
1018 o->field.hash = htole64(hash);
1019 memcpy(o->field.payload, field, size);
1020
1021 r = journal_file_link_field(f, o, p, hash);
1022 if (r < 0)
1023 return r;
1024
1025 /* The linking might have altered the window, so let's
1026 * refresh our pointer */
1027 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1028 if (r < 0)
1029 return r;
1030
1031#ifdef HAVE_GCRYPT
1032 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1033 if (r < 0)
1034 return r;
1035#endif
1036
1037 if (ret)
1038 *ret = o;
1039
1040 if (offset)
1041 *offset = p;
1042
1043 return 0;
1044}
1045
48496df6
LP
1046static int journal_file_append_data(
1047 JournalFile *f,
1048 const void *data, uint64_t size,
1049 Object **ret, uint64_t *offset) {
1050
de190aef
LP
1051 uint64_t hash, p;
1052 uint64_t osize;
1053 Object *o;
d89c8fdf 1054 int r, compression = 0;
3c1668da 1055 const void *eq;
de190aef
LP
1056
1057 assert(f);
1058 assert(data || size == 0);
1059
1060 hash = hash64(data, size);
1061
1062 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1063 if (r < 0)
1064 return r;
0240c603 1065 if (r > 0) {
de190aef
LP
1066
1067 if (ret)
1068 *ret = o;
1069
1070 if (offset)
1071 *offset = p;
1072
1073 return 0;
1074 }
1075
1076 osize = offsetof(Object, data.payload) + size;
1077 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1078 if (r < 0)
1079 return r;
1080
cec736d2 1081 o->data.hash = htole64(hash);
807e17f0 1082
d89c8fdf 1083#if defined(HAVE_XZ) || defined(HAVE_LZ4)
d1afbcd2 1084 if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
a7f7d1bd 1085 size_t rsize = 0;
807e17f0 1086
d89c8fdf 1087 compression = compress_blob(data, size, o->data.payload, &rsize);
807e17f0 1088
d1afbcd2 1089 if (compression >= 0) {
807e17f0 1090 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1091 o->object.flags |= compression;
807e17f0 1092
fa1c4b51 1093 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1094 size, rsize, object_compressed_to_string(compression));
d1afbcd2
LP
1095 } else
1096 /* Compression didn't work, we don't really care why, let's continue without compression */
1097 compression = 0;
807e17f0
LP
1098 }
1099#endif
1100
d1afbcd2 1101 if (compression == 0 && size > 0)
807e17f0 1102 memcpy(o->data.payload, data, size);
cec736d2 1103
de190aef 1104 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1105 if (r < 0)
1106 return r;
1107
48496df6
LP
1108 /* The linking might have altered the window, so let's
1109 * refresh our pointer */
1110 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1111 if (r < 0)
1112 return r;
1113
08c6f819
SL
1114 if (!data)
1115 eq = NULL;
1116 else
1117 eq = memchr(data, '=', size);
3c1668da 1118 if (eq && eq > data) {
748db592 1119 Object *fo = NULL;
3c1668da 1120 uint64_t fp;
3c1668da
LP
1121
1122 /* Create field object ... */
1123 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1124 if (r < 0)
1125 return r;
1126
1127 /* ... and link it in. */
1128 o->data.next_field_offset = fo->field.head_data_offset;
1129 fo->field.head_data_offset = le64toh(p);
1130 }
1131
5996c7c2
LP
1132#ifdef HAVE_GCRYPT
1133 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1134 if (r < 0)
1135 return r;
1136#endif
1137
cec736d2
LP
1138 if (ret)
1139 *ret = o;
1140
1141 if (offset)
de190aef 1142 *offset = p;
cec736d2
LP
1143
1144 return 0;
1145}
1146
1147uint64_t journal_file_entry_n_items(Object *o) {
1148 assert(o);
b588975f
LP
1149
1150 if (o->object.type != OBJECT_ENTRY)
1151 return 0;
cec736d2
LP
1152
1153 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1154}
1155
0284adc6 1156uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1157 assert(o);
b588975f
LP
1158
1159 if (o->object.type != OBJECT_ENTRY_ARRAY)
1160 return 0;
de190aef
LP
1161
1162 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1163}
1164
fb9a24b6
LP
1165uint64_t journal_file_hash_table_n_items(Object *o) {
1166 assert(o);
b588975f
LP
1167
1168 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1169 o->object.type != OBJECT_FIELD_HASH_TABLE)
1170 return 0;
fb9a24b6
LP
1171
1172 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1173}
1174
de190aef 1175static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1176 le64_t *first,
1177 le64_t *idx,
de190aef 1178 uint64_t p) {
cec736d2 1179 int r;
de190aef
LP
1180 uint64_t n = 0, ap = 0, q, i, a, hidx;
1181 Object *o;
1182
cec736d2 1183 assert(f);
de190aef
LP
1184 assert(first);
1185 assert(idx);
1186 assert(p > 0);
cec736d2 1187
de190aef
LP
1188 a = le64toh(*first);
1189 i = hidx = le64toh(*idx);
1190 while (a > 0) {
1191
1192 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1193 if (r < 0)
1194 return r;
cec736d2 1195
de190aef
LP
1196 n = journal_file_entry_array_n_items(o);
1197 if (i < n) {
1198 o->entry_array.items[i] = htole64(p);
1199 *idx = htole64(hidx + 1);
1200 return 0;
1201 }
cec736d2 1202
de190aef
LP
1203 i -= n;
1204 ap = a;
1205 a = le64toh(o->entry_array.next_entry_array_offset);
1206 }
1207
1208 if (hidx > n)
1209 n = (hidx+1) * 2;
1210 else
1211 n = n * 2;
1212
1213 if (n < 4)
1214 n = 4;
1215
1216 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1217 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1218 &o, &q);
cec736d2
LP
1219 if (r < 0)
1220 return r;
1221
feb12d3e 1222#ifdef HAVE_GCRYPT
5996c7c2 1223 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1224 if (r < 0)
1225 return r;
feb12d3e 1226#endif
b0af6f41 1227
de190aef 1228 o->entry_array.items[i] = htole64(p);
cec736d2 1229
de190aef 1230 if (ap == 0)
7be3aa17 1231 *first = htole64(q);
cec736d2 1232 else {
de190aef 1233 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1234 if (r < 0)
1235 return r;
1236
de190aef
LP
1237 o->entry_array.next_entry_array_offset = htole64(q);
1238 }
cec736d2 1239
2dee23eb
LP
1240 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1241 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1242
de190aef
LP
1243 *idx = htole64(hidx + 1);
1244
1245 return 0;
1246}
cec736d2 1247
de190aef 1248static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1249 le64_t *extra,
1250 le64_t *first,
1251 le64_t *idx,
de190aef
LP
1252 uint64_t p) {
1253
1254 int r;
1255
1256 assert(f);
1257 assert(extra);
1258 assert(first);
1259 assert(idx);
1260 assert(p > 0);
1261
1262 if (*idx == 0)
1263 *extra = htole64(p);
1264 else {
4fd052ae 1265 le64_t i;
de190aef 1266
7be3aa17 1267 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1268 r = link_entry_into_array(f, first, &i, p);
1269 if (r < 0)
1270 return r;
cec736d2
LP
1271 }
1272
de190aef
LP
1273 *idx = htole64(le64toh(*idx) + 1);
1274 return 0;
1275}
1276
1277static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1278 uint64_t p;
1279 int r;
1280 assert(f);
1281 assert(o);
1282 assert(offset > 0);
1283
1284 p = le64toh(o->entry.items[i].object_offset);
1285 if (p == 0)
1286 return -EINVAL;
1287
1288 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1289 if (r < 0)
1290 return r;
1291
de190aef
LP
1292 return link_entry_into_array_plus_one(f,
1293 &o->data.entry_offset,
1294 &o->data.entry_array_offset,
1295 &o->data.n_entries,
1296 offset);
cec736d2
LP
1297}
1298
1299static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1300 uint64_t n, i;
cec736d2
LP
1301 int r;
1302
1303 assert(f);
1304 assert(o);
1305 assert(offset > 0);
b588975f
LP
1306
1307 if (o->object.type != OBJECT_ENTRY)
1308 return -EINVAL;
cec736d2 1309
b788cc23
LP
1310 __sync_synchronize();
1311
cec736d2 1312 /* Link up the entry itself */
de190aef
LP
1313 r = link_entry_into_array(f,
1314 &f->header->entry_array_offset,
1315 &f->header->n_entries,
1316 offset);
1317 if (r < 0)
1318 return r;
cec736d2 1319
507f22bd 1320 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1321
de190aef 1322 if (f->header->head_entry_realtime == 0)
0ac38b70 1323 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1324
0ac38b70 1325 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1326 f->header->tail_entry_monotonic = o->entry.monotonic;
1327
1328 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1329
1330 /* Link up the items */
1331 n = journal_file_entry_n_items(o);
1332 for (i = 0; i < n; i++) {
1333 r = journal_file_link_entry_item(f, o, offset, i);
1334 if (r < 0)
1335 return r;
1336 }
1337
cec736d2
LP
1338 return 0;
1339}
1340
1341static int journal_file_append_entry_internal(
1342 JournalFile *f,
1343 const dual_timestamp *ts,
1344 uint64_t xor_hash,
1345 const EntryItem items[], unsigned n_items,
de190aef 1346 uint64_t *seqnum,
cec736d2
LP
1347 Object **ret, uint64_t *offset) {
1348 uint64_t np;
1349 uint64_t osize;
1350 Object *o;
1351 int r;
1352
1353 assert(f);
1354 assert(items || n_items == 0);
de190aef 1355 assert(ts);
cec736d2
LP
1356
1357 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1358
de190aef 1359 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1360 if (r < 0)
1361 return r;
1362
d98cc1f2 1363 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1364 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1365 o->entry.realtime = htole64(ts->realtime);
1366 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1367 o->entry.xor_hash = htole64(xor_hash);
1368 o->entry.boot_id = f->header->boot_id;
1369
feb12d3e 1370#ifdef HAVE_GCRYPT
5996c7c2 1371 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1372 if (r < 0)
1373 return r;
feb12d3e 1374#endif
b0af6f41 1375
cec736d2
LP
1376 r = journal_file_link_entry(f, o, np);
1377 if (r < 0)
1378 return r;
1379
1380 if (ret)
1381 *ret = o;
1382
1383 if (offset)
1384 *offset = np;
1385
1386 return 0;
1387}
1388
cf244689 1389void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1390 assert(f);
1391
1392 /* inotify() does not receive IN_MODIFY events from file
1393 * accesses done via mmap(). After each access we hence
1394 * trigger IN_MODIFY by truncating the journal file to its
1395 * current size which triggers IN_MODIFY. */
1396
bc85bfee
LP
1397 __sync_synchronize();
1398
50f20cfd 1399 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
56f64d95 1400 log_error_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1401}
1402
1f2da9ec
LP
1403static int entry_item_cmp(const void *_a, const void *_b) {
1404 const EntryItem *a = _a, *b = _b;
1405
1406 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1407 return -1;
1408 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1409 return 1;
1410 return 0;
1411}
1412
de190aef 1413int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1414 unsigned i;
1415 EntryItem *items;
1416 int r;
1417 uint64_t xor_hash = 0;
de190aef 1418 struct dual_timestamp _ts;
cec736d2
LP
1419
1420 assert(f);
1421 assert(iovec || n_iovec == 0);
1422
de190aef
LP
1423 if (!ts) {
1424 dual_timestamp_get(&_ts);
1425 ts = &_ts;
1426 }
1427
1428 if (f->tail_entry_monotonic_valid &&
1429 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1430 return -EINVAL;
1431
feb12d3e 1432#ifdef HAVE_GCRYPT
7560fffc
LP
1433 r = journal_file_maybe_append_tag(f, ts->realtime);
1434 if (r < 0)
1435 return r;
feb12d3e 1436#endif
7560fffc 1437
64825d3c 1438 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1439 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1440
1441 for (i = 0; i < n_iovec; i++) {
1442 uint64_t p;
1443 Object *o;
1444
1445 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1446 if (r < 0)
cf244689 1447 return r;
cec736d2
LP
1448
1449 xor_hash ^= le64toh(o->data.hash);
1450 items[i].object_offset = htole64(p);
de7b95cd 1451 items[i].hash = o->data.hash;
cec736d2
LP
1452 }
1453
1f2da9ec
LP
1454 /* Order by the position on disk, in order to improve seek
1455 * times for rotating media. */
7ff7394d 1456 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1457
de190aef 1458 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1459
fa6ac760
LP
1460 /* If the memory mapping triggered a SIGBUS then we return an
1461 * IO error and ignore the error code passed down to us, since
1462 * it is very likely just an effect of a nullified replacement
1463 * mapping page */
1464
1465 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1466 r = -EIO;
1467
50f20cfd
LP
1468 journal_file_post_change(f);
1469
cec736d2
LP
1470 return r;
1471}
1472
/* A remembered position inside a chain of entry arrays, so that later lookups in the same chain
 * can skip ahead instead of walking it from the start. chain_cache_put() stores these items in an
 * OrderedHashmap, using the address of the 'first' member as the key. */
typedef struct ChainCacheItem {
        uint64_t first; /* the array at the beginning of the chain */
        uint64_t array; /* the cached array */
        uint64_t begin; /* the first item in the cached array */
        uint64_t total; /* the total number of items in all arrays before this one in the chain */
        uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
1480
1481static void chain_cache_put(
4743015d 1482 OrderedHashmap *h,
a4bcff5b
LP
1483 ChainCacheItem *ci,
1484 uint64_t first,
1485 uint64_t array,
1486 uint64_t begin,
f268980d
LP
1487 uint64_t total,
1488 uint64_t last_index) {
a4bcff5b
LP
1489
1490 if (!ci) {
34741aa3
LP
1491 /* If the chain item to cache for this chain is the
1492 * first one it's not worth caching anything */
1493 if (array == first)
1494 return;
1495
29433089 1496 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 1497 ci = ordered_hashmap_steal_first(h);
29433089
LP
1498 assert(ci);
1499 } else {
a4bcff5b
LP
1500 ci = new(ChainCacheItem, 1);
1501 if (!ci)
1502 return;
1503 }
1504
1505 ci->first = first;
1506
4743015d 1507 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
1508 free(ci);
1509 return;
1510 }
1511 } else
1512 assert(ci->first == first);
1513
1514 ci->array = array;
1515 ci->begin = begin;
1516 ci->total = total;
f268980d 1517 ci->last_index = last_index;
a4bcff5b
LP
1518}
1519
f268980d
LP
1520static int generic_array_get(
1521 JournalFile *f,
1522 uint64_t first,
1523 uint64_t i,
1524 Object **ret, uint64_t *offset) {
de190aef 1525
cec736d2 1526 Object *o;
a4bcff5b 1527 uint64_t p = 0, a, t = 0;
cec736d2 1528 int r;
a4bcff5b 1529 ChainCacheItem *ci;
cec736d2
LP
1530
1531 assert(f);
1532
de190aef 1533 a = first;
a4bcff5b
LP
1534
1535 /* Try the chain cache first */
4743015d 1536 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1537 if (ci && i > ci->total) {
1538 a = ci->array;
1539 i -= ci->total;
1540 t = ci->total;
1541 }
1542
de190aef 1543 while (a > 0) {
a4bcff5b 1544 uint64_t k;
cec736d2 1545
de190aef
LP
1546 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1547 if (r < 0)
1548 return r;
cec736d2 1549
a4bcff5b
LP
1550 k = journal_file_entry_array_n_items(o);
1551 if (i < k) {
de190aef 1552 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1553 goto found;
cec736d2
LP
1554 }
1555
a4bcff5b
LP
1556 i -= k;
1557 t += k;
de190aef
LP
1558 a = le64toh(o->entry_array.next_entry_array_offset);
1559 }
1560
a4bcff5b
LP
1561 return 0;
1562
1563found:
1564 /* Let's cache this item for the next invocation */
af13a6b0 1565 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1566
1567 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1568 if (r < 0)
1569 return r;
1570
1571 if (ret)
1572 *ret = o;
1573
1574 if (offset)
1575 *offset = p;
1576
1577 return 1;
1578}
1579
f268980d
LP
1580static int generic_array_get_plus_one(
1581 JournalFile *f,
1582 uint64_t extra,
1583 uint64_t first,
1584 uint64_t i,
1585 Object **ret, uint64_t *offset) {
de190aef
LP
1586
1587 Object *o;
1588
1589 assert(f);
1590
1591 if (i == 0) {
1592 int r;
1593
1594 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1595 if (r < 0)
1596 return r;
1597
de190aef
LP
1598 if (ret)
1599 *ret = o;
cec736d2 1600
de190aef
LP
1601 if (offset)
1602 *offset = extra;
cec736d2 1603
de190aef 1604 return 1;
cec736d2
LP
1605 }
1606
de190aef
LP
1607 return generic_array_get(f, first, i-1, ret, offset);
1608}
cec736d2 1609
de190aef
LP
/* Results returned by the test_object callbacks that the bisection functions use to compare the
 * entry at a given offset against the needle. */
enum {
        TEST_FOUND,     /* the tested entry matches the needle */
        TEST_LEFT,      /* the tested entry sorts before the needle */
        TEST_RIGHT      /* the tested entry sorts after the needle */
};
cec736d2 1615
f268980d
LP
/* Bisects the chain of entry arrays starting at offset 'first', of which the first 'n' items are
 * considered, for 'needle'. 'test_object' compares the entry at a given offset against the needle
 * and returns one of the TEST_xxx values or a negative error.
 *
 * An exact match (TEST_FOUND) is treated as TEST_RIGHT for DIRECTION_DOWN and as TEST_LEFT
 * otherwise. The search locates the first item that tests as TEST_RIGHT; for DIRECTION_UP the
 * item right before it is returned instead (subtract_one).
 *
 * Returns 1 if an entry was found, in which case *ret, *offset and *idx (each if non-NULL) receive
 * the mapped entry object, its offset and its index within the whole chain. Returns 0 if nothing
 * matches, -EBADMSG if an inspected array slot contains a zero offset, or any negative value
 * returned by test_object or by mapping an object. */
static int generic_array_bisect(
                JournalFile *f,
                uint64_t first,
                uint64_t n,
                uint64_t needle,
                int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
                direction_t direction,
                Object **ret,
                uint64_t *offset,
                uint64_t *idx) {

        /* a: offset of the array currently looked at; t: number of items in the arrays before it;
         * i: index within the current array; last_p: offset stored in the last considered slot of
         * the previous array; last_index: hint from the chain cache, (uint64_t) -1 if there is none */
        uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
        bool subtract_one = false;
        Object *o, *array = NULL;
        int r;
        ChainCacheItem *ci;

        assert(f);
        assert(test_object);

        /* Start with the first array in the chain */
        a = first;

        ci = ordered_hashmap_get(f->chain_cache, &first);
        if (ci && n > ci->total) {
                /* Ah, we have iterated this bisection array chain
                 * previously! Let's see if we can skip ahead in the
                 * chain, as far as the last time. But we can't jump
                 * backwards in the chain, so let's check that
                 * first. */

                r = test_object(f, ci->begin, needle);
                if (r < 0)
                        return r;

                if (r == TEST_LEFT) {
                        /* OK, what we are looking for is right of the
                         * begin of this EntryArray, so let's jump
                         * straight to previously cached array in the
                         * chain */

                        a = ci->array;
                        n -= ci->total;
                        t = ci->total;
                        last_index = ci->last_index;
                }
        }

        while (a > 0) {
                uint64_t left, right, k, lp;

                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
                if (r < 0)
                        return r;

                /* Only look at the slots of this array that are within the first n items */
                k = journal_file_entry_array_n_items(array);
                right = MIN(k, n);
                if (right <= 0)
                        return 0;

                /* Test the last considered slot first: unless it tests as TEST_RIGHT, nothing in
                 * this array does and we can move on to the next array right away */
                i = right - 1;
                lp = p = le64toh(array->entry_array.items[i]);
                if (p <= 0)
                        return -EBADMSG;

                r = test_object(f, p, needle);
                if (r < 0)
                        return r;

                if (r == TEST_FOUND)
                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                if (r == TEST_RIGHT) {
                        left = 0;
                        right -= 1;

                        if (last_index != (uint64_t) -1) {
                                assert(last_index <= right);

                                /* If we cached the last index we
                                 * looked at, let's try to not to jump
                                 * too wildly around and see if we can
                                 * limit the range to look at early to
                                 * the immediate neighbors of the last
                                 * index we looked at. */

                                if (last_index > 0) {
                                        uint64_t x = last_index - 1;

                                        p = le64toh(array->entry_array.items[x]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = x;
                                        else
                                                left = x + 1;
                                }

                                if (last_index < right) {
                                        uint64_t y = last_index + 1;

                                        p = le64toh(array->entry_array.items[y]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = y;
                                        else
                                                left = y + 1;
                                }
                        }

                        /* Plain bisection over [left, right] until the range collapses */
                        for (;;) {
                                if (left == right) {
                                        if (direction == DIRECTION_UP)
                                                subtract_one = true;

                                        i = left;
                                        goto found;
                                }

                                assert(left < right);
                                i = (left + right) / 2;

                                p = le64toh(array->entry_array.items[i]);
                                if (p <= 0)
                                        return -EBADMSG;

                                r = test_object(f, p, needle);
                                if (r < 0)
                                        return r;

                                if (r == TEST_FOUND)
                                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                if (r == TEST_RIGHT)
                                        right = i;
                                else
                                        left = i + 1;
                        }
                }

                /* This array holds the last of the n items and none of them tested as TEST_RIGHT */
                if (k >= n) {
                        if (direction == DIRECTION_UP) {
                                /* With subtract_one set this selects item n-1 of this array */
                                i = n;
                                subtract_one = true;
                                goto found;
                        }

                        return 0;
                }

                last_p = lp;

                n -= k;
                t += k;
                /* The cached index hint only applies to the array it was recorded for */
                last_index = (uint64_t) -1;
                a = le64toh(array->entry_array.next_entry_array_offset);
        }

        return 0;

found:
        /* We'd have to step back before the very first item of the chain: nothing to return */
        if (subtract_one && t == 0 && i == 0)
                return 0;

        /* Let's cache this item for the next invocation */
        chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);

        /* When stepping back from slot 0 the wanted item is the last one considered in the
         * previous array, whose offset was remembered in last_p */
        if (subtract_one && i == 0)
                p = last_p;
        else if (subtract_one)
                p = le64toh(array->entry_array.items[i-1]);
        else
                p = le64toh(array->entry_array.items[i]);

        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
        if (r < 0)
                return r;

        if (ret)
                *ret = o;

        if (offset)
                *offset = p;

        if (idx)
                *idx = t + i + (subtract_one ? -1 : 0);

        return 1;
}
1822
f268980d
LP
1823static int generic_array_bisect_plus_one(
1824 JournalFile *f,
1825 uint64_t extra,
1826 uint64_t first,
1827 uint64_t n,
1828 uint64_t needle,
1829 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1830 direction_t direction,
1831 Object **ret,
1832 uint64_t *offset,
1833 uint64_t *idx) {
de190aef 1834
cec736d2 1835 int r;
cbdca852
LP
1836 bool step_back = false;
1837 Object *o;
cec736d2
LP
1838
1839 assert(f);
de190aef 1840 assert(test_object);
cec736d2 1841
de190aef
LP
1842 if (n <= 0)
1843 return 0;
cec736d2 1844
de190aef
LP
1845 /* This bisects the array in object 'first', but first checks
1846 * an extra */
de190aef
LP
1847 r = test_object(f, extra, needle);
1848 if (r < 0)
1849 return r;
a536e261
LP
1850
1851 if (r == TEST_FOUND)
1852 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1853
cbdca852
LP
1854 /* if we are looking with DIRECTION_UP then we need to first
1855 see if in the actual array there is a matching entry, and
1856 return the last one of that. But if there isn't any we need
1857 to return this one. Hence remember this, and return it
1858 below. */
1859 if (r == TEST_LEFT)
1860 step_back = direction == DIRECTION_UP;
de190aef 1861
cbdca852
LP
1862 if (r == TEST_RIGHT) {
1863 if (direction == DIRECTION_DOWN)
1864 goto found;
1865 else
1866 return 0;
a536e261 1867 }
cec736d2 1868
de190aef
LP
1869 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1870
cbdca852
LP
1871 if (r == 0 && step_back)
1872 goto found;
1873
ecf68b1d 1874 if (r > 0 && idx)
de190aef
LP
1875 (*idx) ++;
1876
1877 return r;
cbdca852
LP
1878
1879found:
1880 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1881 if (r < 0)
1882 return r;
1883
1884 if (ret)
1885 *ret = o;
1886
1887 if (offset)
1888 *offset = extra;
1889
1890 if (idx)
1891 *idx = 0;
1892
1893 return 1;
1894}
1895
44a6b1b6 1896_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1897 assert(f);
1898 assert(p > 0);
1899
1900 if (p == needle)
1901 return TEST_FOUND;
1902 else if (p < needle)
1903 return TEST_LEFT;
1904 else
1905 return TEST_RIGHT;
1906}
1907
de190aef
LP
1908static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1909 Object *o;
1910 int r;
1911
1912 assert(f);
1913 assert(p > 0);
1914
1915 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1916 if (r < 0)
1917 return r;
1918
de190aef
LP
1919 if (le64toh(o->entry.seqnum) == needle)
1920 return TEST_FOUND;
1921 else if (le64toh(o->entry.seqnum) < needle)
1922 return TEST_LEFT;
1923 else
1924 return TEST_RIGHT;
1925}
cec736d2 1926
de190aef
LP
1927int journal_file_move_to_entry_by_seqnum(
1928 JournalFile *f,
1929 uint64_t seqnum,
1930 direction_t direction,
1931 Object **ret,
1932 uint64_t *offset) {
1933
1934 return generic_array_bisect(f,
1935 le64toh(f->header->entry_array_offset),
1936 le64toh(f->header->n_entries),
1937 seqnum,
1938 test_object_seqnum,
1939 direction,
1940 ret, offset, NULL);
1941}
cec736d2 1942
de190aef
LP
1943static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1944 Object *o;
1945 int r;
1946
1947 assert(f);
1948 assert(p > 0);
1949
1950 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1951 if (r < 0)
1952 return r;
1953
1954 if (le64toh(o->entry.realtime) == needle)
1955 return TEST_FOUND;
1956 else if (le64toh(o->entry.realtime) < needle)
1957 return TEST_LEFT;
1958 else
1959 return TEST_RIGHT;
cec736d2
LP
1960}
1961
de190aef
LP
1962int journal_file_move_to_entry_by_realtime(
1963 JournalFile *f,
1964 uint64_t realtime,
1965 direction_t direction,
1966 Object **ret,
1967 uint64_t *offset) {
1968
1969 return generic_array_bisect(f,
1970 le64toh(f->header->entry_array_offset),
1971 le64toh(f->header->n_entries),
1972 realtime,
1973 test_object_realtime,
1974 direction,
1975 ret, offset, NULL);
1976}
1977
1978static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1979 Object *o;
1980 int r;
1981
1982 assert(f);
1983 assert(p > 0);
1984
1985 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1986 if (r < 0)
1987 return r;
1988
1989 if (le64toh(o->entry.monotonic) == needle)
1990 return TEST_FOUND;
1991 else if (le64toh(o->entry.monotonic) < needle)
1992 return TEST_LEFT;
1993 else
1994 return TEST_RIGHT;
1995}
1996
2a560338 1997static int find_data_object_by_boot_id(
47838ab3
ZJS
1998 JournalFile *f,
1999 sd_id128_t boot_id,
2000 Object **o,
2001 uint64_t *b) {
2a560338 2002
47838ab3
ZJS
2003 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2004
2005 sd_id128_to_string(boot_id, t + 9);
2006 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2007}
2008
de190aef
LP
2009int journal_file_move_to_entry_by_monotonic(
2010 JournalFile *f,
2011 sd_id128_t boot_id,
2012 uint64_t monotonic,
2013 direction_t direction,
2014 Object **ret,
2015 uint64_t *offset) {
2016
de190aef
LP
2017 Object *o;
2018 int r;
2019
cbdca852 2020 assert(f);
de190aef 2021
47838ab3 2022 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
2023 if (r < 0)
2024 return r;
cbdca852 2025 if (r == 0)
de190aef
LP
2026 return -ENOENT;
2027
2028 return generic_array_bisect_plus_one(f,
2029 le64toh(o->data.entry_offset),
2030 le64toh(o->data.entry_array_offset),
2031 le64toh(o->data.n_entries),
2032 monotonic,
2033 test_object_monotonic,
2034 direction,
2035 ret, offset, NULL);
2036}
2037
1fc605b0 2038void journal_file_reset_location(JournalFile *f) {
6573ef05 2039 f->location_type = LOCATION_HEAD;
1fc605b0 2040 f->current_offset = 0;
6573ef05
MS
2041 f->current_seqnum = 0;
2042 f->current_realtime = 0;
2043 f->current_monotonic = 0;
2044 zero(f->current_boot_id);
2045 f->current_xor_hash = 0;
2046}
2047
950c07d4 2048void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2049 f->location_type = LOCATION_SEEK;
2050 f->current_offset = offset;
2051 f->current_seqnum = le64toh(o->entry.seqnum);
2052 f->current_realtime = le64toh(o->entry.realtime);
2053 f->current_monotonic = le64toh(o->entry.monotonic);
2054 f->current_boot_id = o->entry.boot_id;
2055 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2056}
2057
d8ae66d7
MS
2058int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2059 assert(af);
2060 assert(bf);
2061 assert(af->location_type == LOCATION_SEEK);
2062 assert(bf->location_type == LOCATION_SEEK);
2063
2064 /* If contents and timestamps match, these entries are
2065 * identical, even if the seqnum does not match */
2066 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2067 af->current_monotonic == bf->current_monotonic &&
2068 af->current_realtime == bf->current_realtime &&
2069 af->current_xor_hash == bf->current_xor_hash)
2070 return 0;
2071
2072 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2073
2074 /* If this is from the same seqnum source, compare
2075 * seqnums */
2076 if (af->current_seqnum < bf->current_seqnum)
2077 return -1;
2078 if (af->current_seqnum > bf->current_seqnum)
2079 return 1;
2080
2081 /* Wow! This is weird, different data but the same
2082 * seqnums? Something is borked, but let's make the
2083 * best of it and compare by time. */
2084 }
2085
2086 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2087
2088 /* If the boot id matches, compare monotonic time */
2089 if (af->current_monotonic < bf->current_monotonic)
2090 return -1;
2091 if (af->current_monotonic > bf->current_monotonic)
2092 return 1;
2093 }
2094
2095 /* Otherwise, compare UTC time */
2096 if (af->current_realtime < bf->current_realtime)
2097 return -1;
2098 if (af->current_realtime > bf->current_realtime)
2099 return 1;
2100
2101 /* Finally, compare by contents */
2102 if (af->current_xor_hash < bf->current_xor_hash)
2103 return -1;
2104 if (af->current_xor_hash > bf->current_xor_hash)
2105 return 1;
2106
2107 return 0;
2108}
2109
de190aef
LP
2110int journal_file_next_entry(
2111 JournalFile *f,
f534928a 2112 uint64_t p,
de190aef
LP
2113 direction_t direction,
2114 Object **ret, uint64_t *offset) {
2115
fb099c8d 2116 uint64_t i, n, ofs;
cec736d2
LP
2117 int r;
2118
2119 assert(f);
de190aef
LP
2120
2121 n = le64toh(f->header->n_entries);
2122 if (n <= 0)
2123 return 0;
cec736d2 2124
f534928a 2125 if (p == 0)
de190aef 2126 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2127 else {
de190aef
LP
2128 r = generic_array_bisect(f,
2129 le64toh(f->header->entry_array_offset),
2130 le64toh(f->header->n_entries),
2131 p,
2132 test_object_offset,
2133 DIRECTION_DOWN,
2134 NULL, NULL,
2135 &i);
2136 if (r <= 0)
2137 return r;
2138
2139 if (direction == DIRECTION_DOWN) {
2140 if (i >= n - 1)
2141 return 0;
2142
2143 i++;
2144 } else {
2145 if (i <= 0)
2146 return 0;
2147
2148 i--;
2149 }
cec736d2
LP
2150 }
2151
de190aef 2152 /* And jump to it */
fb099c8d
ZJS
2153 r = generic_array_get(f,
2154 le64toh(f->header->entry_array_offset),
2155 i,
2156 ret, &ofs);
2157 if (r <= 0)
2158 return r;
2159
2160 if (p > 0 &&
2161 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2162 log_debug("%s: entry array corrupted at entry %"PRIu64,
2163 f->path, i);
2164 return -EBADMSG;
2165 }
2166
2167 if (offset)
2168 *offset = ofs;
2169
2170 return 1;
de190aef 2171}
cec736d2 2172
de190aef
LP
2173int journal_file_next_entry_for_data(
2174 JournalFile *f,
2175 Object *o, uint64_t p,
2176 uint64_t data_offset,
2177 direction_t direction,
2178 Object **ret, uint64_t *offset) {
2179
2180 uint64_t n, i;
cec736d2 2181 int r;
de190aef 2182 Object *d;
cec736d2
LP
2183
2184 assert(f);
de190aef 2185 assert(p > 0 || !o);
cec736d2 2186
de190aef 2187 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2188 if (r < 0)
de190aef 2189 return r;
cec736d2 2190
de190aef
LP
2191 n = le64toh(d->data.n_entries);
2192 if (n <= 0)
2193 return n;
cec736d2 2194
de190aef
LP
2195 if (!o)
2196 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2197 else {
2198 if (o->object.type != OBJECT_ENTRY)
2199 return -EINVAL;
cec736d2 2200
de190aef
LP
2201 r = generic_array_bisect_plus_one(f,
2202 le64toh(d->data.entry_offset),
2203 le64toh(d->data.entry_array_offset),
2204 le64toh(d->data.n_entries),
2205 p,
2206 test_object_offset,
2207 DIRECTION_DOWN,
2208 NULL, NULL,
2209 &i);
2210
2211 if (r <= 0)
cec736d2
LP
2212 return r;
2213
de190aef
LP
2214 if (direction == DIRECTION_DOWN) {
2215 if (i >= n - 1)
2216 return 0;
cec736d2 2217
de190aef
LP
2218 i++;
2219 } else {
2220 if (i <= 0)
2221 return 0;
cec736d2 2222
de190aef
LP
2223 i--;
2224 }
cec736d2 2225
de190aef 2226 }
cec736d2 2227
de190aef
LP
2228 return generic_array_get_plus_one(f,
2229 le64toh(d->data.entry_offset),
2230 le64toh(d->data.entry_array_offset),
2231 i,
2232 ret, offset);
2233}
cec736d2 2234
cbdca852
LP
2235int journal_file_move_to_entry_by_offset_for_data(
2236 JournalFile *f,
2237 uint64_t data_offset,
2238 uint64_t p,
2239 direction_t direction,
2240 Object **ret, uint64_t *offset) {
2241
2242 int r;
2243 Object *d;
2244
2245 assert(f);
2246
2247 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2248 if (r < 0)
2249 return r;
2250
2251 return generic_array_bisect_plus_one(f,
2252 le64toh(d->data.entry_offset),
2253 le64toh(d->data.entry_array_offset),
2254 le64toh(d->data.n_entries),
2255 p,
2256 test_object_offset,
2257 direction,
2258 ret, offset, NULL);
2259}
2260
2261int journal_file_move_to_entry_by_monotonic_for_data(
2262 JournalFile *f,
2263 uint64_t data_offset,
2264 sd_id128_t boot_id,
2265 uint64_t monotonic,
2266 direction_t direction,
2267 Object **ret, uint64_t *offset) {
2268
cbdca852
LP
2269 Object *o, *d;
2270 int r;
2271 uint64_t b, z;
2272
2273 assert(f);
2274
2275 /* First, seek by time */
47838ab3 2276 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2277 if (r < 0)
2278 return r;
2279 if (r == 0)
2280 return -ENOENT;
2281
2282 r = generic_array_bisect_plus_one(f,
2283 le64toh(o->data.entry_offset),
2284 le64toh(o->data.entry_array_offset),
2285 le64toh(o->data.n_entries),
2286 monotonic,
2287 test_object_monotonic,
2288 direction,
2289 NULL, &z, NULL);
2290 if (r <= 0)
2291 return r;
2292
2293 /* And now, continue seeking until we find an entry that
2294 * exists in both bisection arrays */
2295
2296 for (;;) {
2297 Object *qo;
2298 uint64_t p, q;
2299
2300 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2301 if (r < 0)
2302 return r;
2303
2304 r = generic_array_bisect_plus_one(f,
2305 le64toh(d->data.entry_offset),
2306 le64toh(d->data.entry_array_offset),
2307 le64toh(d->data.n_entries),
2308 z,
2309 test_object_offset,
2310 direction,
2311 NULL, &p, NULL);
2312 if (r <= 0)
2313 return r;
2314
2315 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2316 if (r < 0)
2317 return r;
2318
2319 r = generic_array_bisect_plus_one(f,
2320 le64toh(o->data.entry_offset),
2321 le64toh(o->data.entry_array_offset),
2322 le64toh(o->data.n_entries),
2323 p,
2324 test_object_offset,
2325 direction,
2326 &qo, &q, NULL);
2327
2328 if (r <= 0)
2329 return r;
2330
2331 if (p == q) {
2332 if (ret)
2333 *ret = qo;
2334 if (offset)
2335 *offset = q;
2336
2337 return 1;
2338 }
2339
2340 z = q;
2341 }
cbdca852
LP
2342}
2343
de190aef
LP
2344int journal_file_move_to_entry_by_seqnum_for_data(
2345 JournalFile *f,
2346 uint64_t data_offset,
2347 uint64_t seqnum,
2348 direction_t direction,
2349 Object **ret, uint64_t *offset) {
cec736d2 2350
de190aef
LP
2351 Object *d;
2352 int r;
cec736d2 2353
91a31dde
LP
2354 assert(f);
2355
de190aef 2356 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2357 if (r < 0)
de190aef 2358 return r;
cec736d2 2359
de190aef
LP
2360 return generic_array_bisect_plus_one(f,
2361 le64toh(d->data.entry_offset),
2362 le64toh(d->data.entry_array_offset),
2363 le64toh(d->data.n_entries),
2364 seqnum,
2365 test_object_seqnum,
2366 direction,
2367 ret, offset, NULL);
2368}
cec736d2 2369
de190aef
LP
2370int journal_file_move_to_entry_by_realtime_for_data(
2371 JournalFile *f,
2372 uint64_t data_offset,
2373 uint64_t realtime,
2374 direction_t direction,
2375 Object **ret, uint64_t *offset) {
2376
2377 Object *d;
2378 int r;
2379
91a31dde
LP
2380 assert(f);
2381
de190aef 2382 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2383 if (r < 0)
de190aef
LP
2384 return r;
2385
2386 return generic_array_bisect_plus_one(f,
2387 le64toh(d->data.entry_offset),
2388 le64toh(d->data.entry_array_offset),
2389 le64toh(d->data.n_entries),
2390 realtime,
2391 test_object_realtime,
2392 direction,
2393 ret, offset, NULL);
cec736d2
LP
2394}
2395
0284adc6 2396void journal_file_dump(JournalFile *f) {
7560fffc 2397 Object *o;
7560fffc 2398 int r;
0284adc6 2399 uint64_t p;
7560fffc
LP
2400
2401 assert(f);
2402
0284adc6 2403 journal_file_print_header(f);
7560fffc 2404
0284adc6
LP
2405 p = le64toh(f->header->header_size);
2406 while (p != 0) {
d05089d8 2407 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
2408 if (r < 0)
2409 goto fail;
7560fffc 2410
0284adc6 2411 switch (o->object.type) {
d98cc1f2 2412
0284adc6
LP
2413 case OBJECT_UNUSED:
2414 printf("Type: OBJECT_UNUSED\n");
2415 break;
d98cc1f2 2416
0284adc6
LP
2417 case OBJECT_DATA:
2418 printf("Type: OBJECT_DATA\n");
2419 break;
7560fffc 2420
3c1668da
LP
2421 case OBJECT_FIELD:
2422 printf("Type: OBJECT_FIELD\n");
2423 break;
2424
0284adc6 2425 case OBJECT_ENTRY:
507f22bd
ZJS
2426 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2427 le64toh(o->entry.seqnum),
2428 le64toh(o->entry.monotonic),
2429 le64toh(o->entry.realtime));
0284adc6 2430 break;
7560fffc 2431
0284adc6
LP
2432 case OBJECT_FIELD_HASH_TABLE:
2433 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2434 break;
7560fffc 2435
0284adc6
LP
2436 case OBJECT_DATA_HASH_TABLE:
2437 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2438 break;
7560fffc 2439
0284adc6
LP
2440 case OBJECT_ENTRY_ARRAY:
2441 printf("Type: OBJECT_ENTRY_ARRAY\n");
2442 break;
7560fffc 2443
0284adc6 2444 case OBJECT_TAG:
507f22bd
ZJS
2445 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2446 le64toh(o->tag.seqnum),
2447 le64toh(o->tag.epoch));
0284adc6 2448 break;
3c1668da
LP
2449
2450 default:
8facc349 2451 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 2452 break;
0284adc6 2453 }
7560fffc 2454
d89c8fdf
ZJS
2455 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2456 printf("Flags: %s\n",
2457 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 2458
0284adc6
LP
2459 if (p == le64toh(f->header->tail_object_offset))
2460 p = 0;
2461 else
2462 p = p + ALIGN64(le64toh(o->object.size));
2463 }
7560fffc 2464
0284adc6
LP
2465 return;
2466fail:
2467 log_error("File corrupt");
7560fffc
LP
2468}
2469
718fe4b1
ZJS
2470static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2471 const char *x;
2472
2473 x = format_timestamp(buf, l, t);
2474 if (x)
2475 return x;
2476 return " --- ";
2477}
2478
0284adc6 2479void journal_file_print_header(JournalFile *f) {
2765b7bb 2480 char a[33], b[33], c[33], d[33];
ed375beb 2481 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2482 struct stat st;
2483 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2484
2485 assert(f);
7560fffc 2486
0284adc6
LP
2487 printf("File Path: %s\n"
2488 "File ID: %s\n"
2489 "Machine ID: %s\n"
2490 "Boot ID: %s\n"
2491 "Sequential Number ID: %s\n"
2492 "State: %s\n"
2493 "Compatible Flags:%s%s\n"
d89c8fdf 2494 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
2495 "Header size: %"PRIu64"\n"
2496 "Arena size: %"PRIu64"\n"
2497 "Data Hash Table Size: %"PRIu64"\n"
2498 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2499 "Rotate Suggested: %s\n"
507f22bd
ZJS
2500 "Head Sequential Number: %"PRIu64"\n"
2501 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2502 "Head Realtime Timestamp: %s\n"
3223f44f 2503 "Tail Realtime Timestamp: %s\n"
ed375beb 2504 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2505 "Objects: %"PRIu64"\n"
2506 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2507 f->path,
2508 sd_id128_to_string(f->header->file_id, a),
2509 sd_id128_to_string(f->header->machine_id, b),
2510 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2511 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2512 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2513 f->header->state == STATE_ONLINE ? "ONLINE" :
2514 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 2515 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
2516 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2517 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2518 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2519 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
2520 le64toh(f->header->header_size),
2521 le64toh(f->header->arena_size),
2522 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2523 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2524 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2525 le64toh(f->header->head_entry_seqnum),
2526 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2527 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2528 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2529 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2530 le64toh(f->header->n_objects),
2531 le64toh(f->header->n_entries));
7560fffc 2532
0284adc6 2533 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2534 printf("Data Objects: %"PRIu64"\n"
0284adc6 2535 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2536 le64toh(f->header->n_data),
0284adc6 2537 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2538
0284adc6 2539 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2540 printf("Field Objects: %"PRIu64"\n"
0284adc6 2541 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2542 le64toh(f->header->n_fields),
0284adc6 2543 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2544
2545 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2546 printf("Tag Objects: %"PRIu64"\n",
2547 le64toh(f->header->n_tags));
3223f44f 2548 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2549 printf("Entry Array Objects: %"PRIu64"\n",
2550 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2551
2552 if (fstat(f->fd, &st) >= 0)
59f448cf 2553 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
7560fffc
LP
2554}
2555
fc68c929
LP
2556static int journal_file_warn_btrfs(JournalFile *f) {
2557 unsigned attrs;
2558 int r;
2559
2560 assert(f);
2561
2562 /* Before we write anything, check if the COW logic is turned
2563 * off on btrfs. Given our write pattern that is quite
2564 * unfriendly to COW file systems this should greatly improve
2565 * performance on COW file systems, such as btrfs, at the
2566 * expense of data integrity features (which shouldn't be too
2567 * bad, given that we do our own checksumming). */
2568
2569 r = btrfs_is_filesystem(f->fd);
2570 if (r < 0)
2571 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2572 if (!r)
2573 return 0;
2574
2575 r = read_attr_fd(f->fd, &attrs);
2576 if (r < 0)
2577 return log_warning_errno(r, "Failed to read file attributes: %m");
2578
2579 if (attrs & FS_NOCOW_FL) {
2580 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2581 return 0;
2582 }
2583
2584 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2585 "This is likely to slow down journal access substantially, please consider turning "
2586 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2587
2588 return 1;
2589}
2590
0284adc6
LP
2591int journal_file_open(
2592 const char *fname,
2593 int flags,
2594 mode_t mode,
2595 bool compress,
baed47c3 2596 bool seal,
0284adc6
LP
2597 JournalMetrics *metrics,
2598 MMapCache *mmap_cache,
2599 JournalFile *template,
2600 JournalFile **ret) {
7560fffc 2601
fa6ac760 2602 bool newly_created = false;
0284adc6 2603 JournalFile *f;
fa6ac760 2604 void *h;
0284adc6 2605 int r;
7560fffc 2606
0284adc6 2607 assert(fname);
0559d3a5 2608 assert(ret);
7560fffc 2609
0284adc6
LP
2610 if ((flags & O_ACCMODE) != O_RDONLY &&
2611 (flags & O_ACCMODE) != O_RDWR)
2612 return -EINVAL;
7560fffc 2613
a0108012
LP
2614 if (!endswith(fname, ".journal") &&
2615 !endswith(fname, ".journal~"))
0284adc6 2616 return -EINVAL;
7560fffc 2617
0284adc6
LP
2618 f = new0(JournalFile, 1);
2619 if (!f)
2620 return -ENOMEM;
7560fffc 2621
0284adc6
LP
2622 f->fd = -1;
2623 f->mode = mode;
7560fffc 2624
0284adc6
LP
2625 f->flags = flags;
2626 f->prot = prot_from_flags(flags);
2627 f->writable = (flags & O_ACCMODE) != O_RDONLY;
92261977 2628#if defined(HAVE_LZ4)
d89c8fdf
ZJS
2629 f->compress_lz4 = compress;
2630#elif defined(HAVE_XZ)
2631 f->compress_xz = compress;
48b61739 2632#endif
49a32d43 2633#ifdef HAVE_GCRYPT
baed47c3 2634 f->seal = seal;
49a32d43 2635#endif
7560fffc 2636
0284adc6
LP
2637 if (mmap_cache)
2638 f->mmap = mmap_cache_ref(mmap_cache);
2639 else {
84168d80 2640 f->mmap = mmap_cache_new();
0284adc6
LP
2641 if (!f->mmap) {
2642 r = -ENOMEM;
2643 goto fail;
2644 }
2645 }
7560fffc 2646
0284adc6
LP
2647 f->path = strdup(fname);
2648 if (!f->path) {
2649 r = -ENOMEM;
2650 goto fail;
2651 }
7560fffc 2652
4743015d 2653 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
2654 if (!f->chain_cache) {
2655 r = -ENOMEM;
2656 goto fail;
2657 }
2658
0284adc6
LP
2659 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2660 if (f->fd < 0) {
2661 r = -errno;
2662 goto fail;
7560fffc 2663 }
7560fffc 2664
2678031a
LP
2665 r = journal_file_fstat(f);
2666 if (r < 0)
0284adc6 2667 goto fail;
7560fffc 2668
0284adc6 2669 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a 2670
fc68c929 2671 (void) journal_file_warn_btrfs(f);
11689d2a 2672
fb0951b0
LP
2673 /* Let's attach the creation time to the journal file,
2674 * so that the vacuuming code knows the age of this
2675 * file even if the file might end up corrupted one
2676 * day... Ideally we'd just use the creation time many
2677 * file systems maintain for each file, but there is
2678 * currently no usable API to query this, hence let's
2679 * emulate this via extended attributes. If extended
2680 * attributes are not supported we'll just skip this,
7517e174 2681 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0 2682
d61b600d 2683 fd_setcrtime(f->fd, 0);
7560fffc 2684
feb12d3e 2685#ifdef HAVE_GCRYPT
0284adc6 2686 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2687 * just don't do sealing */
49a32d43
LP
2688 if (f->seal) {
2689 r = journal_file_fss_load(f);
2690 if (r < 0)
2691 f->seal = false;
2692 }
feb12d3e 2693#endif
7560fffc 2694
0284adc6
LP
2695 r = journal_file_init_header(f, template);
2696 if (r < 0)
2697 goto fail;
7560fffc 2698
2678031a
LP
2699 r = journal_file_fstat(f);
2700 if (r < 0)
0284adc6 2701 goto fail;
fb0951b0
LP
2702
2703 newly_created = true;
0284adc6 2704 }
7560fffc 2705
0284adc6 2706 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
cfb571f3 2707 r = -ENODATA;
0284adc6
LP
2708 goto fail;
2709 }
7560fffc 2710
fa6ac760 2711 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
977eaa1e 2712 if (r < 0)
0284adc6 2713 goto fail;
7560fffc 2714
fa6ac760
LP
2715 f->header = h;
2716
0284adc6
LP
2717 if (!newly_created) {
2718 r = journal_file_verify_header(f);
2719 if (r < 0)
2720 goto fail;
2721 }
7560fffc 2722
feb12d3e 2723#ifdef HAVE_GCRYPT
0284adc6 2724 if (!newly_created && f->writable) {
baed47c3 2725 r = journal_file_fss_load(f);
0284adc6
LP
2726 if (r < 0)
2727 goto fail;
2728 }
feb12d3e 2729#endif
cec736d2
LP
2730
2731 if (f->writable) {
4a92baf3
LP
2732 if (metrics) {
2733 journal_default_metrics(metrics, f->fd);
2734 f->metrics = *metrics;
2735 } else if (template)
2736 f->metrics = template->metrics;
2737
cec736d2
LP
2738 r = journal_file_refresh_header(f);
2739 if (r < 0)
2740 goto fail;
2741 }
2742
feb12d3e 2743#ifdef HAVE_GCRYPT
baed47c3 2744 r = journal_file_hmac_setup(f);
14d10188
LP
2745 if (r < 0)
2746 goto fail;
feb12d3e 2747#endif
14d10188 2748
cec736d2 2749 if (newly_created) {
de190aef 2750 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2751 if (r < 0)
2752 goto fail;
2753
de190aef 2754 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2755 if (r < 0)
2756 goto fail;
7560fffc 2757
feb12d3e 2758#ifdef HAVE_GCRYPT
7560fffc
LP
2759 r = journal_file_append_first_tag(f);
2760 if (r < 0)
2761 goto fail;
feb12d3e 2762#endif
cec736d2
LP
2763 }
2764
fa6ac760
LP
2765 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2766 r = -EIO;
2767 goto fail;
2768 }
2769
0559d3a5 2770 *ret = f;
cec736d2
LP
2771 return 0;
2772
2773fail:
fa6ac760
LP
2774 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2775 r = -EIO;
2776
cec736d2
LP
2777 journal_file_close(f);
2778
2779 return r;
2780}
0ac38b70 2781
baed47c3 2782int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2783 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2784 size_t l;
2785 JournalFile *old_file, *new_file = NULL;
2786 int r;
2787
2788 assert(f);
2789 assert(*f);
2790
2791 old_file = *f;
2792
2793 if (!old_file->writable)
2794 return -EINVAL;
2795
2796 if (!endswith(old_file->path, ".journal"))
2797 return -EINVAL;
2798
2799 l = strlen(old_file->path);
57535f47
ZJS
2800 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2801 (int) l - 8, old_file->path,
2802 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2803 le64toh((*f)->header->head_entry_seqnum),
2804 le64toh((*f)->header->head_entry_realtime));
2805 if (r < 0)
0ac38b70
LP
2806 return -ENOMEM;
2807
2678031a
LP
2808 /* Try to rename the file to the archived version. If the file
2809 * already was deleted, we'll get ENOENT, let's ignore that
2810 * case. */
0ac38b70 2811 r = rename(old_file->path, p);
2678031a 2812 if (r < 0 && errno != ENOENT)
0ac38b70
LP
2813 return -errno;
2814
ccdbaf91 2815 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2816
f27a3864
LP
2817 /* Currently, btrfs is not very good with out write patterns
2818 * and fragments heavily. Let's defrag our journal files when
2819 * we archive them */
2820 old_file->defrag_on_close = true;
2821
baed47c3 2822 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2823 journal_file_close(old_file);
2824
2825 *f = new_file;
2826 return r;
2827}
2828
9447a7f1
LP
2829int journal_file_open_reliably(
2830 const char *fname,
2831 int flags,
2832 mode_t mode,
7560fffc 2833 bool compress,
baed47c3 2834 bool seal,
4a92baf3 2835 JournalMetrics *metrics,
27370278 2836 MMapCache *mmap_cache,
9447a7f1
LP
2837 JournalFile *template,
2838 JournalFile **ret) {
2839
2840 int r;
2841 size_t l;
ed375beb 2842 _cleanup_free_ char *p = NULL;
9447a7f1 2843
070052ab 2844 r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
288359db
ZJS
2845 if (!IN_SET(r,
2846 -EBADMSG, /* corrupted */
2847 -ENODATA, /* truncated */
2848 -EHOSTDOWN, /* other machine */
2849 -EPROTONOSUPPORT, /* incompatible feature */
2850 -EBUSY, /* unclean shutdown */
2851 -ESHUTDOWN, /* already archived */
2852 -EIO, /* IO error, including SIGBUS on mmap */
2853 -EIDRM /* File has been deleted */))
9447a7f1
LP
2854 return r;
2855
2856 if ((flags & O_ACCMODE) == O_RDONLY)
2857 return r;
2858
2859 if (!(flags & O_CREAT))
2860 return r;
2861
7560fffc
LP
2862 if (!endswith(fname, ".journal"))
2863 return r;
2864
5c70eab4
LP
2865 /* The file is corrupted. Rotate it away and try it again (but only once) */
2866
9447a7f1 2867 l = strlen(fname);
d587eca5 2868 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
57535f47 2869 (int) l - 8, fname,
d587eca5 2870 now(CLOCK_REALTIME),
9bf3b535 2871 random_u64()) < 0)
9447a7f1
LP
2872 return -ENOMEM;
2873
65089b82 2874 if (rename(fname, p) < 0)
9447a7f1
LP
2875 return -errno;
2876
f27a3864
LP
2877 /* btrfs doesn't cope well with our write pattern and
2878 * fragments heavily. Let's defrag all files we rotate */
11689d2a
LP
2879
2880 (void) chattr_path(p, false, FS_NOCOW_FL);
f27a3864
LP
2881 (void) btrfs_defrag(p);
2882
65089b82 2883 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2884
070052ab 2885 return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
9447a7f1
LP
2886}
2887
cf244689
LP
2888int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2889 uint64_t i, n;
2890 uint64_t q, xor_hash = 0;
2891 int r;
2892 EntryItem *items;
2893 dual_timestamp ts;
2894
2895 assert(from);
2896 assert(to);
2897 assert(o);
2898 assert(p);
2899
2900 if (!to->writable)
2901 return -EPERM;
2902
2903 ts.monotonic = le64toh(o->entry.monotonic);
2904 ts.realtime = le64toh(o->entry.realtime);
2905
cf244689 2906 n = journal_file_entry_n_items(o);
4faa7004
TA
2907 /* alloca() can't take 0, hence let's allocate at least one */
2908 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
2909
2910 for (i = 0; i < n; i++) {
4fd052ae
FC
2911 uint64_t l, h;
2912 le64_t le_hash;
cf244689
LP
2913 size_t t;
2914 void *data;
2915 Object *u;
2916
2917 q = le64toh(o->entry.items[i].object_offset);
2918 le_hash = o->entry.items[i].hash;
2919
2920 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2921 if (r < 0)
2922 return r;
2923
2924 if (le_hash != o->data.hash)
2925 return -EBADMSG;
2926
2927 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2928 t = (size_t) l;
2929
2930 /* We hit the limit on 32bit machines */
2931 if ((uint64_t) t != l)
2932 return -E2BIG;
2933
d89c8fdf 2934 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 2935#if defined(HAVE_XZ) || defined(HAVE_LZ4)
a7f7d1bd 2936 size_t rsize = 0;
cf244689 2937
d89c8fdf
ZJS
2938 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2939 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2940 if (r < 0)
2941 return r;
cf244689
LP
2942
2943 data = from->compress_buffer;
2944 l = rsize;
3b1a55e1
ZJS
2945#else
2946 return -EPROTONOSUPPORT;
2947#endif
cf244689
LP
2948 } else
2949 data = o->data.payload;
2950
2951 r = journal_file_append_data(to, data, l, &u, &h);
2952 if (r < 0)
2953 return r;
2954
2955 xor_hash ^= le64toh(u->data.hash);
2956 items[i].object_offset = htole64(h);
2957 items[i].hash = u->data.hash;
2958
2959 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2960 if (r < 0)
2961 return r;
2962 }
2963
fa6ac760
LP
2964 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2965
2966 if (mmap_cache_got_sigbus(to->mmap, to->fd))
2967 return -EIO;
2968
2969 return r;
cf244689 2970}
babfc091 2971
8580d1f7
LP
2972void journal_reset_metrics(JournalMetrics *m) {
2973 assert(m);
2974
2975 /* Set everything to "pick automatic values". */
2976
2977 *m = (JournalMetrics) {
2978 .min_use = (uint64_t) -1,
2979 .max_use = (uint64_t) -1,
2980 .min_size = (uint64_t) -1,
2981 .max_size = (uint64_t) -1,
2982 .keep_free = (uint64_t) -1,
2983 .n_max_files = (uint64_t) -1,
2984 };
2985}
2986
babfc091 2987void journal_default_metrics(JournalMetrics *m, int fd) {
8580d1f7 2988 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
babfc091 2989 struct statvfs ss;
8580d1f7 2990 uint64_t fs_size;
babfc091
LP
2991
2992 assert(m);
2993 assert(fd >= 0);
2994
2995 if (fstatvfs(fd, &ss) >= 0)
2996 fs_size = ss.f_frsize * ss.f_blocks;
8580d1f7
LP
2997 else {
2998 log_debug_errno(errno, "Failed to detremine disk size: %m");
2999 fs_size = 0;
3000 }
babfc091
LP
3001
3002 if (m->max_use == (uint64_t) -1) {
3003
3004 if (fs_size > 0) {
3005 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3006
3007 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3008 m->max_use = DEFAULT_MAX_USE_UPPER;
3009
3010 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3011 m->max_use = DEFAULT_MAX_USE_LOWER;
3012 } else
3013 m->max_use = DEFAULT_MAX_USE_LOWER;
3014 } else {
3015 m->max_use = PAGE_ALIGN(m->max_use);
3016
8580d1f7 3017 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
babfc091
LP
3018 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3019 }
3020
8580d1f7
LP
3021 if (m->min_use == (uint64_t) -1)
3022 m->min_use = DEFAULT_MIN_USE;
3023
3024 if (m->min_use > m->max_use)
3025 m->min_use = m->max_use;
3026
babfc091
LP
3027 if (m->max_size == (uint64_t) -1) {
3028 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3029
3030 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3031 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3032 } else
3033 m->max_size = PAGE_ALIGN(m->max_size);
3034
8580d1f7
LP
3035 if (m->max_size != 0) {
3036 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3037 m->max_size = JOURNAL_FILE_SIZE_MIN;
babfc091 3038
8580d1f7
LP
3039 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3040 m->max_use = m->max_size*2;
3041 }
babfc091
LP
3042
3043 if (m->min_size == (uint64_t) -1)
3044 m->min_size = JOURNAL_FILE_SIZE_MIN;
3045 else {
3046 m->min_size = PAGE_ALIGN(m->min_size);
3047
3048 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3049 m->min_size = JOURNAL_FILE_SIZE_MIN;
3050
8580d1f7 3051 if (m->max_size != 0 && m->min_size > m->max_size)
babfc091
LP
3052 m->max_size = m->min_size;
3053 }
3054
3055 if (m->keep_free == (uint64_t) -1) {
3056
3057 if (fs_size > 0) {
8621b110 3058 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
3059
3060 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3061 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3062
3063 } else
3064 m->keep_free = DEFAULT_KEEP_FREE;
3065 }
3066
8580d1f7
LP
3067 if (m->n_max_files == (uint64_t) -1)
3068 m->n_max_files = DEFAULT_N_MAX_FILES;
3069
3070 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3071 format_bytes(a, sizeof(a), m->min_use),
3072 format_bytes(b, sizeof(b), m->max_use),
3073 format_bytes(c, sizeof(c), m->max_size),
3074 format_bytes(d, sizeof(d), m->min_size),
3075 format_bytes(e, sizeof(e), m->keep_free),
3076 m->n_max_files);
babfc091 3077}
08984293
LP
3078
3079int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
3080 assert(f);
3081 assert(from || to);
3082
3083 if (from) {
162566a4
LP
3084 if (f->header->head_entry_realtime == 0)
3085 return -ENOENT;
08984293 3086
162566a4 3087 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
3088 }
3089
3090 if (to) {
162566a4
LP
3091 if (f->header->tail_entry_realtime == 0)
3092 return -ENOENT;
08984293 3093
162566a4 3094 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
3095 }
3096
3097 return 1;
3098}
3099
3100int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
3101 Object *o;
3102 uint64_t p;
3103 int r;
3104
3105 assert(f);
3106 assert(from || to);
3107
47838ab3 3108 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
3109 if (r <= 0)
3110 return r;
3111
3112 if (le64toh(o->data.n_entries) <= 0)
3113 return 0;
3114
3115 if (from) {
3116 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3117 if (r < 0)
3118 return r;
3119
3120 *from = le64toh(o->entry.monotonic);
3121 }
3122
3123 if (to) {
3124 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3125 if (r < 0)
3126 return r;
3127
3128 r = generic_array_get_plus_one(f,
3129 le64toh(o->data.entry_offset),
3130 le64toh(o->data.entry_array_offset),
3131 le64toh(o->data.n_entries)-1,
3132 &o, NULL);
3133 if (r <= 0)
3134 return r;
3135
3136 *to = le64toh(o->entry.monotonic);
3137 }
3138
3139 return 1;
3140}
dca6219e 3141
fb0951b0 3142bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
3143 assert(f);
3144
3145 /* If we gained new header fields we gained new features,
3146 * hence suggest a rotation */
361f9cbc
LP
3147 if (le64toh(f->header->header_size) < sizeof(Header)) {
3148 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3149 return true;
361f9cbc 3150 }
dca6219e
LP
3151
3152 /* Let's check if the hash tables grew over a certain fill
3153 * level (75%, borrowing this value from Java's hash table
3154 * implementation), and if so suggest a rotation. To calculate
3155 * the fill level we need the n_data field, which only exists
3156 * in newer versions. */
3157
3158 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3159 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3160 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3161 f->path,
3162 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3163 le64toh(f->header->n_data),
3164 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3165 (unsigned long long) f->last_stat.st_size,
3166 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3167 return true;
361f9cbc 3168 }
dca6219e
LP
3169
3170 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3171 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3172 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3173 f->path,
3174 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3175 le64toh(f->header->n_fields),
3176 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3177 return true;
361f9cbc 3178 }
dca6219e 3179
0598fd4a
LP
3180 /* Are the data objects properly indexed by field objects? */
3181 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3182 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3183 le64toh(f->header->n_data) > 0 &&
3184 le64toh(f->header->n_fields) == 0)
3185 return true;
3186
fb0951b0
LP
3187 if (max_file_usec > 0) {
3188 usec_t t, h;
3189
3190 h = le64toh(f->header->head_entry_realtime);
3191 t = now(CLOCK_REALTIME);
3192
3193 if (h > 0 && t > h + max_file_usec)
3194 return true;
3195 }
3196
dca6219e
LP
3197 return false;
3198}