]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
util-lib: split string parsing related calls from util.[ch] into parse-util.[ch]
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
cec736d2 22#include <errno.h>
cec736d2 23#include <fcntl.h>
11689d2a 24#include <linux/fs.h>
07630cea
LP
25#include <stddef.h>
26#include <sys/mman.h>
27#include <sys/statvfs.h>
28#include <sys/uio.h>
29#include <unistd.h>
fb0951b0 30
f27a3864 31#include "btrfs-util.h"
07630cea 32#include "compress.h"
3ffd4af2 33#include "fd-util.h"
0284adc6 34#include "journal-authenticate.h"
cec736d2
LP
35#include "journal-def.h"
36#include "journal-file.h"
37#include "lookup3.h"
6bedfcbb 38#include "parse-util.h"
3df3e884 39#include "random-util.h"
07630cea 40#include "string-util.h"
cec736d2 41
4a92baf3
LP
42#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
43#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 44
be19b7df 45#define COMPRESSION_SIZE_THRESHOLD (512ULL)
807e17f0 46
babfc091 47/* This is the minimum journal file size */
253f59df 48#define JOURNAL_FILE_SIZE_MIN (4ULL*1024ULL*1024ULL) /* 4 MiB */
babfc091
LP
49
50/* These are the lower and upper bounds if we deduce the max_use value
51 * from the file system size */
52#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
53#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
54
8580d1f7
LP
55/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
56#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
57
babfc091 58/* This is the upper bound if we deduce max_size from max_use */
71100051 59#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
babfc091
LP
60
61/* This is the upper bound if we deduce the keep_free value from the
62 * file system size */
63#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
64
65/* This is the keep_free value when we can't determine the system
66 * size */
67#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
68
8580d1f7
LP
69/* This is the default maximum number of journal files to keep around. */
70#define DEFAULT_N_MAX_FILES (100)
71
dca6219e
LP
72/* n_data was the first entry we added after the initial file format design */
73#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 74
a4bcff5b
LP
75/* How many entries to keep in the entry array chain cache at max */
76#define CHAIN_CACHE_MAX 20
77
a676e665
LP
78/* How much to increase the journal file size at once each time we allocate something new. */
79#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
80
2678031a
LP
81/* Reread fstat() of the file for detecting deletions at least this often */
82#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
83
fa6ac760
LP
84/* The mmap context to use for the header; we pick the one just above the last defined object type */
85#define CONTEXT_HEADER _OBJECT_TYPE_MAX
86
9588bc32 87static int journal_file_set_online(JournalFile *f) {
26687bf8
OS
88 assert(f);
89
90 if (!f->writable)
91 return -EPERM;
92
93 if (!(f->fd >= 0 && f->header))
94 return -EINVAL;
95
fa6ac760
LP
96 if (mmap_cache_got_sigbus(f->mmap, f->fd))
97 return -EIO;
98
26687bf8
OS
99 switch(f->header->state) {
100 case STATE_ONLINE:
101 return 0;
102
103 case STATE_OFFLINE:
104 f->header->state = STATE_ONLINE;
105 fsync(f->fd);
106 return 0;
107
108 default:
109 return -EINVAL;
110 }
111}
112
113int journal_file_set_offline(JournalFile *f) {
114 assert(f);
115
116 if (!f->writable)
117 return -EPERM;
118
119 if (!(f->fd >= 0 && f->header))
120 return -EINVAL;
121
122 if (f->header->state != STATE_ONLINE)
123 return 0;
124
125 fsync(f->fd);
126
fa6ac760
LP
127 if (mmap_cache_got_sigbus(f->mmap, f->fd))
128 return -EIO;
129
26687bf8
OS
130 f->header->state = STATE_OFFLINE;
131
fa6ac760
LP
132 if (mmap_cache_got_sigbus(f->mmap, f->fd))
133 return -EIO;
134
26687bf8
OS
135 fsync(f->fd);
136
137 return 0;
138}
139
804ae586 140JournalFile* journal_file_close(JournalFile *f) {
de190aef 141 assert(f);
cec736d2 142
feb12d3e 143#ifdef HAVE_GCRYPT
b0af6f41 144 /* Write the final tag */
c586dbf1 145 if (f->seal && f->writable)
b0af6f41 146 journal_file_append_tag(f);
feb12d3e 147#endif
b0af6f41 148
26687bf8 149 journal_file_set_offline(f);
cec736d2 150
fa6ac760
LP
151 if (f->mmap && f->fd >= 0)
152 mmap_cache_close_fd(f->mmap, f->fd);
cec736d2 153
11689d2a
LP
154 if (f->fd >= 0 && f->defrag_on_close) {
155
156 /* Be friendly to btrfs: turn COW back on again now,
157 * and defragment the file. We won't write to the file
158 * ever again, hence remove all fragmentation, and
159 * reenable all the good bits COW usually provides
160 * (such as data checksumming). */
161
1ed8f8c1 162 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
11689d2a
LP
163 (void) btrfs_defrag_fd(f->fd);
164 }
f27a3864 165
03e334a1 166 safe_close(f->fd);
cec736d2 167 free(f->path);
807e17f0 168
16e9f408
LP
169 if (f->mmap)
170 mmap_cache_unref(f->mmap);
171
4743015d 172 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 173
d89c8fdf 174#if defined(HAVE_XZ) || defined(HAVE_LZ4)
807e17f0
LP
175 free(f->compress_buffer);
176#endif
177
7560fffc 178#ifdef HAVE_GCRYPT
baed47c3
LP
179 if (f->fss_file)
180 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
dc4ebc07 181 else
b7c9ae91
LP
182 free(f->fsprg_state);
183
184 free(f->fsprg_seed);
7560fffc
LP
185
186 if (f->hmac)
187 gcry_md_close(f->hmac);
188#endif
189
cec736d2 190 free(f);
804ae586 191 return NULL;
cec736d2
LP
192}
193
0ac38b70 194static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 195 Header h = {};
cec736d2
LP
196 ssize_t k;
197 int r;
198
199 assert(f);
200
7560fffc 201 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 202 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 203
d89c8fdf
ZJS
204 h.incompatible_flags |= htole32(
205 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
206 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 207
d89c8fdf
ZJS
208 h.compatible_flags = htole32(
209 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 210
cec736d2
LP
211 r = sd_id128_randomize(&h.file_id);
212 if (r < 0)
213 return r;
214
0ac38b70
LP
215 if (template) {
216 h.seqnum_id = template->header->seqnum_id;
beec0085 217 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
218 } else
219 h.seqnum_id = h.file_id;
cec736d2
LP
220
221 k = pwrite(f->fd, &h, sizeof(h), 0);
222 if (k < 0)
223 return -errno;
224
225 if (k != sizeof(h))
226 return -EIO;
227
228 return 0;
229}
230
231static int journal_file_refresh_header(JournalFile *f) {
de190aef 232 sd_id128_t boot_id;
fa6ac760 233 int r;
cec736d2
LP
234
235 assert(f);
236
237 r = sd_id128_get_machine(&f->header->machine_id);
238 if (r < 0)
239 return r;
240
de190aef 241 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
242 if (r < 0)
243 return r;
244
de190aef
LP
245 if (sd_id128_equal(boot_id, f->header->boot_id))
246 f->tail_entry_monotonic_valid = true;
247
248 f->header->boot_id = boot_id;
249
fa6ac760 250 r = journal_file_set_online(f);
b788cc23 251
7560fffc 252 /* Sync the online state to disk */
a676e665 253 fsync(f->fd);
b788cc23 254
fa6ac760 255 return r;
cec736d2
LP
256}
257
258static int journal_file_verify_header(JournalFile *f) {
d89c8fdf
ZJS
259 uint32_t flags;
260
cec736d2
LP
261 assert(f);
262
7560fffc 263 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
264 return -EBADMSG;
265
7560fffc
LP
266 /* In both read and write mode we refuse to open files with
267 * incompatible flags we don't know */
d89c8fdf
ZJS
268 flags = le32toh(f->header->incompatible_flags);
269 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
270 if (flags & ~HEADER_INCOMPATIBLE_ANY)
271 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
272 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
273 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
274 if (flags)
275 log_debug("Journal file %s uses incompatible flags %"PRIx32
276 " disabled at compilation time.", f->path, flags);
cec736d2 277 return -EPROTONOSUPPORT;
d89c8fdf 278 }
cec736d2 279
7560fffc
LP
280 /* When open for writing we refuse to open files with
281 * compatible flags, too */
d89c8fdf
ZJS
282 flags = le32toh(f->header->compatible_flags);
283 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
284 if (flags & ~HEADER_COMPATIBLE_ANY)
285 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
286 f->path, flags & ~HEADER_COMPATIBLE_ANY);
287 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
288 if (flags)
289 log_debug("Journal file %s uses compatible flags %"PRIx32
290 " disabled at compilation time.", f->path, flags);
291 return -EPROTONOSUPPORT;
7560fffc
LP
292 }
293
db11ac1a
LP
294 if (f->header->state >= _STATE_MAX)
295 return -EBADMSG;
296
dca6219e
LP
297 /* The first addition was n_data, so check that we are at least this large */
298 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
299 return -EBADMSG;
300
8088cbd3 301 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
302 return -EBADMSG;
303
db11ac1a
LP
304 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
305 return -ENODATA;
306
307 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
308 return -ENODATA;
309
7762e02b
LP
310 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
311 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
312 !VALID64(le64toh(f->header->tail_object_offset)) ||
313 !VALID64(le64toh(f->header->entry_array_offset)))
314 return -ENODATA;
315
cec736d2 316 if (f->writable) {
ccdbaf91 317 uint8_t state;
cec736d2
LP
318 sd_id128_t machine_id;
319 int r;
320
321 r = sd_id128_get_machine(&machine_id);
322 if (r < 0)
323 return r;
324
325 if (!sd_id128_equal(machine_id, f->header->machine_id))
326 return -EHOSTDOWN;
327
de190aef 328 state = f->header->state;
cec736d2 329
71fa6f00
LP
330 if (state == STATE_ONLINE) {
331 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
332 return -EBUSY;
333 } else if (state == STATE_ARCHIVED)
cec736d2 334 return -ESHUTDOWN;
71fa6f00 335 else if (state != STATE_OFFLINE) {
8facc349 336 log_debug("Journal file %s has unknown state %i.", f->path, state);
71fa6f00
LP
337 return -EBUSY;
338 }
cec736d2
LP
339 }
340
d89c8fdf
ZJS
341 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
342 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 343
f1889c91 344 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 345
cec736d2
LP
346 return 0;
347}
348
2678031a
LP
349static int journal_file_fstat(JournalFile *f) {
350 assert(f);
351 assert(f->fd >= 0);
352
353 if (fstat(f->fd, &f->last_stat) < 0)
354 return -errno;
355
356 f->last_stat_usec = now(CLOCK_MONOTONIC);
357
358 /* Refuse appending to files that are already deleted */
359 if (f->last_stat.st_nlink <= 0)
360 return -EIDRM;
361
362 return 0;
363}
364
cec736d2 365static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 366 uint64_t old_size, new_size;
fec2aa2f 367 int r;
cec736d2
LP
368
369 assert(f);
370
cec736d2 371 /* We assume that this file is not sparse, and we know that
38ac38b2 372 * for sure, since we always call posix_fallocate()
cec736d2
LP
373 * ourselves */
374
fa6ac760
LP
375 if (mmap_cache_got_sigbus(f->mmap, f->fd))
376 return -EIO;
377
cec736d2 378 old_size =
23b0b2b2 379 le64toh(f->header->header_size) +
cec736d2
LP
380 le64toh(f->header->arena_size);
381
bc85bfee 382 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
383 if (new_size < le64toh(f->header->header_size))
384 new_size = le64toh(f->header->header_size);
bc85bfee 385
2678031a
LP
386 if (new_size <= old_size) {
387
388 /* We already pre-allocated enough space, but before
389 * we write to it, let's check with fstat() if the
390 * file got deleted, in order make sure we don't throw
391 * away the data immediately. Don't check fstat() for
392 * all writes though, but only once ever 10s. */
393
394 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
395 return 0;
396
397 return journal_file_fstat(f);
398 }
399
400 /* Allocate more space. */
cec736d2 401
a676e665 402 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 403 return -E2BIG;
cec736d2 404
a676e665 405 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
406 struct statvfs svfs;
407
408 if (fstatvfs(f->fd, &svfs) >= 0) {
409 uint64_t available;
410
070052ab 411 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
cec736d2
LP
412
413 if (new_size - old_size > available)
414 return -E2BIG;
415 }
416 }
417
eda4b58b
LP
418 /* Increase by larger blocks at once */
419 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
420 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
421 new_size = f->metrics.max_size;
422
bc85bfee
LP
423 /* Note that the glibc fallocate() fallback is very
424 inefficient, hence we try to minimize the allocation area
425 as we can. */
fec2aa2f
GV
426 r = posix_fallocate(f->fd, old_size, new_size - old_size);
427 if (r != 0)
428 return -r;
cec736d2 429
23b0b2b2 430 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 431
2678031a 432 return journal_file_fstat(f);
cec736d2
LP
433}
434
78519831 435static unsigned type_to_context(ObjectType type) {
d3d3208f 436 /* One context for each type, plus one catch-all for the rest */
69adae51 437 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 438 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 439 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
440}
441
7a9dabea 442static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
2678031a
LP
443 int r;
444
cec736d2 445 assert(f);
cec736d2
LP
446 assert(ret);
447
7762e02b
LP
448 if (size <= 0)
449 return -EINVAL;
450
2a59ea54 451 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
452 if (offset + size > (uint64_t) f->last_stat.st_size) {
453 /* Hmm, out of range? Let's refresh the fstat() data
454 * first, before we trust that check. */
455
2678031a
LP
456 r = journal_file_fstat(f);
457 if (r < 0)
458 return r;
459
460 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
461 return -EADDRNOTAVAIL;
462 }
463
7a9dabea 464 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
465}
466
16e9f408
LP
467static uint64_t minimum_header_size(Object *o) {
468
b8e891e6 469 static const uint64_t table[] = {
16e9f408
LP
470 [OBJECT_DATA] = sizeof(DataObject),
471 [OBJECT_FIELD] = sizeof(FieldObject),
472 [OBJECT_ENTRY] = sizeof(EntryObject),
473 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
474 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
475 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
476 [OBJECT_TAG] = sizeof(TagObject),
477 };
478
479 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
480 return sizeof(ObjectHeader);
481
482 return table[o->object.type];
483}
484
78519831 485int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
486 int r;
487 void *t;
488 Object *o;
489 uint64_t s;
490
491 assert(f);
492 assert(ret);
493
db11ac1a
LP
494 /* Objects may only be located at multiple of 64 bit */
495 if (!VALID64(offset))
496 return -EFAULT;
497
7a9dabea 498 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
499 if (r < 0)
500 return r;
501
502 o = (Object*) t;
503 s = le64toh(o->object.size);
504
505 if (s < sizeof(ObjectHeader))
506 return -EBADMSG;
507
16e9f408
LP
508 if (o->object.type <= OBJECT_UNUSED)
509 return -EBADMSG;
510
511 if (s < minimum_header_size(o))
512 return -EBADMSG;
513
d05089d8 514 if (type > OBJECT_UNUSED && o->object.type != type)
cec736d2
LP
515 return -EBADMSG;
516
517 if (s > sizeof(ObjectHeader)) {
7a9dabea 518 r = journal_file_move_to(f, type, false, offset, s, &t);
cec736d2
LP
519 if (r < 0)
520 return r;
521
522 o = (Object*) t;
523 }
524
cec736d2
LP
525 *ret = o;
526 return 0;
527}
528
d98cc1f2 529static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
530 uint64_t r;
531
532 assert(f);
533
beec0085 534 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
535
536 if (seqnum) {
de190aef 537 /* If an external seqnum counter was passed, we update
c2373f84
LP
538 * both the local and the external one, and set it to
539 * the maximum of both */
540
541 if (*seqnum + 1 > r)
542 r = *seqnum + 1;
543
544 *seqnum = r;
545 }
546
beec0085 547 f->header->tail_entry_seqnum = htole64(r);
cec736d2 548
beec0085
LP
549 if (f->header->head_entry_seqnum == 0)
550 f->header->head_entry_seqnum = htole64(r);
de190aef 551
cec736d2
LP
552 return r;
553}
554
78519831 555int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
556 int r;
557 uint64_t p;
558 Object *tail, *o;
559 void *t;
560
561 assert(f);
d05089d8 562 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
563 assert(size >= sizeof(ObjectHeader));
564 assert(offset);
565 assert(ret);
566
26687bf8
OS
567 r = journal_file_set_online(f);
568 if (r < 0)
569 return r;
570
cec736d2 571 p = le64toh(f->header->tail_object_offset);
cec736d2 572 if (p == 0)
23b0b2b2 573 p = le64toh(f->header->header_size);
cec736d2 574 else {
d05089d8 575 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
576 if (r < 0)
577 return r;
578
579 p += ALIGN64(le64toh(tail->object.size));
580 }
581
582 r = journal_file_allocate(f, p, size);
583 if (r < 0)
584 return r;
585
fcde2389 586 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
587 if (r < 0)
588 return r;
589
590 o = (Object*) t;
591
592 zero(o->object);
de190aef 593 o->object.type = type;
cec736d2
LP
594 o->object.size = htole64(size);
595
596 f->header->tail_object_offset = htole64(p);
cec736d2
LP
597 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
598
599 *ret = o;
600 *offset = p;
601
602 return 0;
603}
604
de190aef 605static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
606 uint64_t s, p;
607 Object *o;
608 int r;
609
610 assert(f);
611
070052ab
LP
612 /* We estimate that we need 1 hash table entry per 768 bytes
613 of journal file and we want to make sure we never get
614 beyond 75% fill level. Calculate the hash table size for
615 the maximum file size based on these metrics. */
4a92baf3 616
dfabe643 617 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
618 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
619 s = DEFAULT_DATA_HASH_TABLE_SIZE;
620
507f22bd 621 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 622
de190aef
LP
623 r = journal_file_append_object(f,
624 OBJECT_DATA_HASH_TABLE,
625 offsetof(Object, hash_table.items) + s,
626 &o, &p);
cec736d2
LP
627 if (r < 0)
628 return r;
629
29804cc1 630 memzero(o->hash_table.items, s);
cec736d2 631
de190aef
LP
632 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
633 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
634
635 return 0;
636}
637
de190aef 638static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
639 uint64_t s, p;
640 Object *o;
641 int r;
642
643 assert(f);
644
3c1668da
LP
645 /* We use a fixed size hash table for the fields as this
646 * number should grow very slowly only */
647
de190aef
LP
648 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
649 r = journal_file_append_object(f,
650 OBJECT_FIELD_HASH_TABLE,
651 offsetof(Object, hash_table.items) + s,
652 &o, &p);
cec736d2
LP
653 if (r < 0)
654 return r;
655
29804cc1 656 memzero(o->hash_table.items, s);
cec736d2 657
de190aef
LP
658 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
659 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
660
661 return 0;
662}
663
dade37d4 664int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
665 uint64_t s, p;
666 void *t;
667 int r;
668
669 assert(f);
670
dade37d4
LP
671 if (f->data_hash_table)
672 return 0;
673
de190aef
LP
674 p = le64toh(f->header->data_hash_table_offset);
675 s = le64toh(f->header->data_hash_table_size);
cec736d2 676
de190aef 677 r = journal_file_move_to(f,
16e9f408 678 OBJECT_DATA_HASH_TABLE,
fcde2389 679 true,
de190aef
LP
680 p, s,
681 &t);
cec736d2
LP
682 if (r < 0)
683 return r;
684
de190aef 685 f->data_hash_table = t;
cec736d2
LP
686 return 0;
687}
688
dade37d4 689int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
690 uint64_t s, p;
691 void *t;
692 int r;
693
694 assert(f);
695
dade37d4
LP
696 if (f->field_hash_table)
697 return 0;
698
de190aef
LP
699 p = le64toh(f->header->field_hash_table_offset);
700 s = le64toh(f->header->field_hash_table_size);
cec736d2 701
de190aef 702 r = journal_file_move_to(f,
16e9f408 703 OBJECT_FIELD_HASH_TABLE,
fcde2389 704 true,
de190aef
LP
705 p, s,
706 &t);
cec736d2
LP
707 if (r < 0)
708 return r;
709
de190aef 710 f->field_hash_table = t;
cec736d2
LP
711 return 0;
712}
713
3c1668da
LP
714static int journal_file_link_field(
715 JournalFile *f,
716 Object *o,
717 uint64_t offset,
718 uint64_t hash) {
719
805d1486 720 uint64_t p, h, m;
3c1668da
LP
721 int r;
722
723 assert(f);
724 assert(o);
725 assert(offset > 0);
726
727 if (o->object.type != OBJECT_FIELD)
728 return -EINVAL;
729
805d1486
LP
730 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
731 if (m <= 0)
732 return -EBADMSG;
3c1668da 733
805d1486 734 /* This might alter the window we are looking at */
3c1668da
LP
735 o->field.next_hash_offset = o->field.head_data_offset = 0;
736
805d1486 737 h = hash % m;
3c1668da
LP
738 p = le64toh(f->field_hash_table[h].tail_hash_offset);
739 if (p == 0)
740 f->field_hash_table[h].head_hash_offset = htole64(offset);
741 else {
742 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
743 if (r < 0)
744 return r;
745
746 o->field.next_hash_offset = htole64(offset);
747 }
748
749 f->field_hash_table[h].tail_hash_offset = htole64(offset);
750
751 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
752 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
753
754 return 0;
755}
756
757static int journal_file_link_data(
758 JournalFile *f,
759 Object *o,
760 uint64_t offset,
761 uint64_t hash) {
762
805d1486 763 uint64_t p, h, m;
cec736d2
LP
764 int r;
765
766 assert(f);
767 assert(o);
768 assert(offset > 0);
b588975f
LP
769
770 if (o->object.type != OBJECT_DATA)
771 return -EINVAL;
cec736d2 772
805d1486
LP
773 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
774 if (m <= 0)
775 return -EBADMSG;
48496df6 776
805d1486 777 /* This might alter the window we are looking at */
de190aef
LP
778 o->data.next_hash_offset = o->data.next_field_offset = 0;
779 o->data.entry_offset = o->data.entry_array_offset = 0;
780 o->data.n_entries = 0;
cec736d2 781
805d1486 782 h = hash % m;
8db4213e 783 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 784 if (p == 0)
cec736d2 785 /* Only entry in the hash table is easy */
de190aef 786 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 787 else {
48496df6
LP
788 /* Move back to the previous data object, to patch in
789 * pointer */
cec736d2 790
de190aef 791 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
792 if (r < 0)
793 return r;
794
de190aef 795 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
796 }
797
de190aef 798 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 799
dca6219e
LP
800 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
801 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
802
cec736d2
LP
803 return 0;
804}
805
3c1668da
LP
806int journal_file_find_field_object_with_hash(
807 JournalFile *f,
808 const void *field, uint64_t size, uint64_t hash,
809 Object **ret, uint64_t *offset) {
810
805d1486 811 uint64_t p, osize, h, m;
3c1668da
LP
812 int r;
813
814 assert(f);
815 assert(field && size > 0);
816
dade37d4
LP
817 /* If the field hash table is empty, we can't find anything */
818 if (le64toh(f->header->field_hash_table_size) <= 0)
819 return 0;
820
821 /* Map the field hash table, if it isn't mapped yet. */
822 r = journal_file_map_field_hash_table(f);
823 if (r < 0)
824 return r;
825
3c1668da
LP
826 osize = offsetof(Object, field.payload) + size;
827
805d1486 828 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
805d1486 829 if (m <= 0)
3c1668da
LP
830 return -EBADMSG;
831
805d1486 832 h = hash % m;
3c1668da
LP
833 p = le64toh(f->field_hash_table[h].head_hash_offset);
834
835 while (p > 0) {
836 Object *o;
837
838 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
839 if (r < 0)
840 return r;
841
842 if (le64toh(o->field.hash) == hash &&
843 le64toh(o->object.size) == osize &&
844 memcmp(o->field.payload, field, size) == 0) {
845
846 if (ret)
847 *ret = o;
848 if (offset)
849 *offset = p;
850
851 return 1;
852 }
853
854 p = le64toh(o->field.next_hash_offset);
855 }
856
857 return 0;
858}
859
860int journal_file_find_field_object(
861 JournalFile *f,
862 const void *field, uint64_t size,
863 Object **ret, uint64_t *offset) {
864
865 uint64_t hash;
866
867 assert(f);
868 assert(field && size > 0);
869
870 hash = hash64(field, size);
871
872 return journal_file_find_field_object_with_hash(f,
873 field, size, hash,
874 ret, offset);
875}
876
de190aef
LP
877int journal_file_find_data_object_with_hash(
878 JournalFile *f,
879 const void *data, uint64_t size, uint64_t hash,
880 Object **ret, uint64_t *offset) {
48496df6 881
805d1486 882 uint64_t p, osize, h, m;
cec736d2
LP
883 int r;
884
885 assert(f);
886 assert(data || size == 0);
887
dade37d4
LP
888 /* If there's no data hash table, then there's no entry. */
889 if (le64toh(f->header->data_hash_table_size) <= 0)
890 return 0;
891
892 /* Map the data hash table, if it isn't mapped yet. */
893 r = journal_file_map_data_hash_table(f);
894 if (r < 0)
895 return r;
896
cec736d2
LP
897 osize = offsetof(Object, data.payload) + size;
898
805d1486
LP
899 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
900 if (m <= 0)
bc85bfee
LP
901 return -EBADMSG;
902
805d1486 903 h = hash % m;
de190aef 904 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 905
de190aef
LP
906 while (p > 0) {
907 Object *o;
cec736d2 908
de190aef 909 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
910 if (r < 0)
911 return r;
912
807e17f0 913 if (le64toh(o->data.hash) != hash)
85a131e8 914 goto next;
807e17f0 915
d89c8fdf 916 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 917#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51 918 uint64_t l;
a7f7d1bd 919 size_t rsize = 0;
cec736d2 920
807e17f0
LP
921 l = le64toh(o->object.size);
922 if (l <= offsetof(Object, data.payload))
cec736d2
LP
923 return -EBADMSG;
924
807e17f0
LP
925 l -= offsetof(Object, data.payload);
926
d89c8fdf
ZJS
927 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
928 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
929 if (r < 0)
930 return r;
807e17f0 931
b785c858 932 if (rsize == size &&
807e17f0
LP
933 memcmp(f->compress_buffer, data, size) == 0) {
934
935 if (ret)
936 *ret = o;
937
938 if (offset)
939 *offset = p;
940
941 return 1;
942 }
3b1a55e1
ZJS
943#else
944 return -EPROTONOSUPPORT;
945#endif
807e17f0
LP
946 } else if (le64toh(o->object.size) == osize &&
947 memcmp(o->data.payload, data, size) == 0) {
948
cec736d2
LP
949 if (ret)
950 *ret = o;
951
952 if (offset)
953 *offset = p;
954
de190aef 955 return 1;
cec736d2
LP
956 }
957
85a131e8 958 next:
cec736d2
LP
959 p = le64toh(o->data.next_hash_offset);
960 }
961
de190aef
LP
962 return 0;
963}
964
965int journal_file_find_data_object(
966 JournalFile *f,
967 const void *data, uint64_t size,
968 Object **ret, uint64_t *offset) {
969
970 uint64_t hash;
971
972 assert(f);
973 assert(data || size == 0);
974
975 hash = hash64(data, size);
976
977 return journal_file_find_data_object_with_hash(f,
978 data, size, hash,
979 ret, offset);
980}
981
3c1668da
LP
982static int journal_file_append_field(
983 JournalFile *f,
984 const void *field, uint64_t size,
985 Object **ret, uint64_t *offset) {
986
987 uint64_t hash, p;
988 uint64_t osize;
989 Object *o;
990 int r;
991
992 assert(f);
993 assert(field && size > 0);
994
995 hash = hash64(field, size);
996
997 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
998 if (r < 0)
999 return r;
1000 else if (r > 0) {
1001
1002 if (ret)
1003 *ret = o;
1004
1005 if (offset)
1006 *offset = p;
1007
1008 return 0;
1009 }
1010
1011 osize = offsetof(Object, field.payload) + size;
1012 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
1013 if (r < 0)
1014 return r;
3c1668da
LP
1015
1016 o->field.hash = htole64(hash);
1017 memcpy(o->field.payload, field, size);
1018
1019 r = journal_file_link_field(f, o, p, hash);
1020 if (r < 0)
1021 return r;
1022
1023 /* The linking might have altered the window, so let's
1024 * refresh our pointer */
1025 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1026 if (r < 0)
1027 return r;
1028
1029#ifdef HAVE_GCRYPT
1030 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1031 if (r < 0)
1032 return r;
1033#endif
1034
1035 if (ret)
1036 *ret = o;
1037
1038 if (offset)
1039 *offset = p;
1040
1041 return 0;
1042}
1043
48496df6
LP
1044static int journal_file_append_data(
1045 JournalFile *f,
1046 const void *data, uint64_t size,
1047 Object **ret, uint64_t *offset) {
1048
de190aef
LP
1049 uint64_t hash, p;
1050 uint64_t osize;
1051 Object *o;
d89c8fdf 1052 int r, compression = 0;
3c1668da 1053 const void *eq;
de190aef
LP
1054
1055 assert(f);
1056 assert(data || size == 0);
1057
1058 hash = hash64(data, size);
1059
1060 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1061 if (r < 0)
1062 return r;
0240c603 1063 if (r > 0) {
de190aef
LP
1064
1065 if (ret)
1066 *ret = o;
1067
1068 if (offset)
1069 *offset = p;
1070
1071 return 0;
1072 }
1073
1074 osize = offsetof(Object, data.payload) + size;
1075 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1076 if (r < 0)
1077 return r;
1078
cec736d2 1079 o->data.hash = htole64(hash);
807e17f0 1080
d89c8fdf 1081#if defined(HAVE_XZ) || defined(HAVE_LZ4)
d1afbcd2 1082 if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
a7f7d1bd 1083 size_t rsize = 0;
807e17f0 1084
d89c8fdf 1085 compression = compress_blob(data, size, o->data.payload, &rsize);
807e17f0 1086
d1afbcd2 1087 if (compression >= 0) {
807e17f0 1088 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1089 o->object.flags |= compression;
807e17f0 1090
fa1c4b51 1091 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1092 size, rsize, object_compressed_to_string(compression));
d1afbcd2
LP
1093 } else
1094 /* Compression didn't work, we don't really care why, let's continue without compression */
1095 compression = 0;
807e17f0
LP
1096 }
1097#endif
1098
d1afbcd2 1099 if (compression == 0 && size > 0)
807e17f0 1100 memcpy(o->data.payload, data, size);
cec736d2 1101
de190aef 1102 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1103 if (r < 0)
1104 return r;
1105
48496df6
LP
1106 /* The linking might have altered the window, so let's
1107 * refresh our pointer */
1108 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1109 if (r < 0)
1110 return r;
1111
08c6f819
SL
1112 if (!data)
1113 eq = NULL;
1114 else
1115 eq = memchr(data, '=', size);
3c1668da 1116 if (eq && eq > data) {
748db592 1117 Object *fo = NULL;
3c1668da 1118 uint64_t fp;
3c1668da
LP
1119
1120 /* Create field object ... */
1121 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1122 if (r < 0)
1123 return r;
1124
1125 /* ... and link it in. */
1126 o->data.next_field_offset = fo->field.head_data_offset;
1127 fo->field.head_data_offset = le64toh(p);
1128 }
1129
5996c7c2
LP
1130#ifdef HAVE_GCRYPT
1131 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1132 if (r < 0)
1133 return r;
1134#endif
1135
cec736d2
LP
1136 if (ret)
1137 *ret = o;
1138
1139 if (offset)
de190aef 1140 *offset = p;
cec736d2
LP
1141
1142 return 0;
1143}
1144
1145uint64_t journal_file_entry_n_items(Object *o) {
1146 assert(o);
b588975f
LP
1147
1148 if (o->object.type != OBJECT_ENTRY)
1149 return 0;
cec736d2
LP
1150
1151 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1152}
1153
0284adc6 1154uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1155 assert(o);
b588975f
LP
1156
1157 if (o->object.type != OBJECT_ENTRY_ARRAY)
1158 return 0;
de190aef
LP
1159
1160 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1161}
1162
fb9a24b6
LP
1163uint64_t journal_file_hash_table_n_items(Object *o) {
1164 assert(o);
b588975f
LP
1165
1166 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1167 o->object.type != OBJECT_FIELD_HASH_TABLE)
1168 return 0;
fb9a24b6
LP
1169
1170 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1171}
1172
de190aef 1173static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1174 le64_t *first,
1175 le64_t *idx,
de190aef 1176 uint64_t p) {
cec736d2 1177 int r;
de190aef
LP
1178 uint64_t n = 0, ap = 0, q, i, a, hidx;
1179 Object *o;
1180
cec736d2 1181 assert(f);
de190aef
LP
1182 assert(first);
1183 assert(idx);
1184 assert(p > 0);
cec736d2 1185
de190aef
LP
1186 a = le64toh(*first);
1187 i = hidx = le64toh(*idx);
1188 while (a > 0) {
1189
1190 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1191 if (r < 0)
1192 return r;
cec736d2 1193
de190aef
LP
1194 n = journal_file_entry_array_n_items(o);
1195 if (i < n) {
1196 o->entry_array.items[i] = htole64(p);
1197 *idx = htole64(hidx + 1);
1198 return 0;
1199 }
cec736d2 1200
de190aef
LP
1201 i -= n;
1202 ap = a;
1203 a = le64toh(o->entry_array.next_entry_array_offset);
1204 }
1205
1206 if (hidx > n)
1207 n = (hidx+1) * 2;
1208 else
1209 n = n * 2;
1210
1211 if (n < 4)
1212 n = 4;
1213
1214 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1215 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1216 &o, &q);
cec736d2
LP
1217 if (r < 0)
1218 return r;
1219
feb12d3e 1220#ifdef HAVE_GCRYPT
5996c7c2 1221 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1222 if (r < 0)
1223 return r;
feb12d3e 1224#endif
b0af6f41 1225
de190aef 1226 o->entry_array.items[i] = htole64(p);
cec736d2 1227
de190aef 1228 if (ap == 0)
7be3aa17 1229 *first = htole64(q);
cec736d2 1230 else {
de190aef 1231 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1232 if (r < 0)
1233 return r;
1234
de190aef
LP
1235 o->entry_array.next_entry_array_offset = htole64(q);
1236 }
cec736d2 1237
2dee23eb
LP
1238 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1239 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1240
de190aef
LP
1241 *idx = htole64(hidx + 1);
1242
1243 return 0;
1244}
cec736d2 1245
de190aef 1246static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1247 le64_t *extra,
1248 le64_t *first,
1249 le64_t *idx,
de190aef
LP
1250 uint64_t p) {
1251
1252 int r;
1253
1254 assert(f);
1255 assert(extra);
1256 assert(first);
1257 assert(idx);
1258 assert(p > 0);
1259
1260 if (*idx == 0)
1261 *extra = htole64(p);
1262 else {
4fd052ae 1263 le64_t i;
de190aef 1264
7be3aa17 1265 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1266 r = link_entry_into_array(f, first, &i, p);
1267 if (r < 0)
1268 return r;
cec736d2
LP
1269 }
1270
de190aef
LP
1271 *idx = htole64(le64toh(*idx) + 1);
1272 return 0;
1273}
1274
1275static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1276 uint64_t p;
1277 int r;
1278 assert(f);
1279 assert(o);
1280 assert(offset > 0);
1281
1282 p = le64toh(o->entry.items[i].object_offset);
1283 if (p == 0)
1284 return -EINVAL;
1285
1286 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1287 if (r < 0)
1288 return r;
1289
de190aef
LP
1290 return link_entry_into_array_plus_one(f,
1291 &o->data.entry_offset,
1292 &o->data.entry_array_offset,
1293 &o->data.n_entries,
1294 offset);
cec736d2
LP
1295}
1296
1297static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1298 uint64_t n, i;
cec736d2
LP
1299 int r;
1300
1301 assert(f);
1302 assert(o);
1303 assert(offset > 0);
b588975f
LP
1304
1305 if (o->object.type != OBJECT_ENTRY)
1306 return -EINVAL;
cec736d2 1307
b788cc23
LP
1308 __sync_synchronize();
1309
cec736d2 1310 /* Link up the entry itself */
de190aef
LP
1311 r = link_entry_into_array(f,
1312 &f->header->entry_array_offset,
1313 &f->header->n_entries,
1314 offset);
1315 if (r < 0)
1316 return r;
cec736d2 1317
507f22bd 1318 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1319
de190aef 1320 if (f->header->head_entry_realtime == 0)
0ac38b70 1321 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1322
0ac38b70 1323 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1324 f->header->tail_entry_monotonic = o->entry.monotonic;
1325
1326 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1327
1328 /* Link up the items */
1329 n = journal_file_entry_n_items(o);
1330 for (i = 0; i < n; i++) {
1331 r = journal_file_link_entry_item(f, o, offset, i);
1332 if (r < 0)
1333 return r;
1334 }
1335
cec736d2
LP
1336 return 0;
1337}
1338
1339static int journal_file_append_entry_internal(
1340 JournalFile *f,
1341 const dual_timestamp *ts,
1342 uint64_t xor_hash,
1343 const EntryItem items[], unsigned n_items,
de190aef 1344 uint64_t *seqnum,
cec736d2
LP
1345 Object **ret, uint64_t *offset) {
1346 uint64_t np;
1347 uint64_t osize;
1348 Object *o;
1349 int r;
1350
1351 assert(f);
1352 assert(items || n_items == 0);
de190aef 1353 assert(ts);
cec736d2
LP
1354
1355 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1356
de190aef 1357 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1358 if (r < 0)
1359 return r;
1360
d98cc1f2 1361 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1362 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1363 o->entry.realtime = htole64(ts->realtime);
1364 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1365 o->entry.xor_hash = htole64(xor_hash);
1366 o->entry.boot_id = f->header->boot_id;
1367
feb12d3e 1368#ifdef HAVE_GCRYPT
5996c7c2 1369 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1370 if (r < 0)
1371 return r;
feb12d3e 1372#endif
b0af6f41 1373
cec736d2
LP
1374 r = journal_file_link_entry(f, o, np);
1375 if (r < 0)
1376 return r;
1377
1378 if (ret)
1379 *ret = o;
1380
1381 if (offset)
1382 *offset = np;
1383
1384 return 0;
1385}
1386
cf244689 1387void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1388 assert(f);
1389
1390 /* inotify() does not receive IN_MODIFY events from file
1391 * accesses done via mmap(). After each access we hence
1392 * trigger IN_MODIFY by truncating the journal file to its
1393 * current size which triggers IN_MODIFY. */
1394
bc85bfee
LP
1395 __sync_synchronize();
1396
50f20cfd 1397 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
56f64d95 1398 log_error_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1399}
1400
1f2da9ec
LP
1401static int entry_item_cmp(const void *_a, const void *_b) {
1402 const EntryItem *a = _a, *b = _b;
1403
1404 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1405 return -1;
1406 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1407 return 1;
1408 return 0;
1409}
1410
de190aef 1411int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1412 unsigned i;
1413 EntryItem *items;
1414 int r;
1415 uint64_t xor_hash = 0;
de190aef 1416 struct dual_timestamp _ts;
cec736d2
LP
1417
1418 assert(f);
1419 assert(iovec || n_iovec == 0);
1420
de190aef
LP
1421 if (!ts) {
1422 dual_timestamp_get(&_ts);
1423 ts = &_ts;
1424 }
1425
1426 if (f->tail_entry_monotonic_valid &&
1427 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1428 return -EINVAL;
1429
feb12d3e 1430#ifdef HAVE_GCRYPT
7560fffc
LP
1431 r = journal_file_maybe_append_tag(f, ts->realtime);
1432 if (r < 0)
1433 return r;
feb12d3e 1434#endif
7560fffc 1435
64825d3c 1436 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1437 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1438
1439 for (i = 0; i < n_iovec; i++) {
1440 uint64_t p;
1441 Object *o;
1442
1443 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1444 if (r < 0)
cf244689 1445 return r;
cec736d2
LP
1446
1447 xor_hash ^= le64toh(o->data.hash);
1448 items[i].object_offset = htole64(p);
de7b95cd 1449 items[i].hash = o->data.hash;
cec736d2
LP
1450 }
1451
1f2da9ec
LP
1452 /* Order by the position on disk, in order to improve seek
1453 * times for rotating media. */
7ff7394d 1454 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1455
de190aef 1456 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1457
fa6ac760
LP
1458 /* If the memory mapping triggered a SIGBUS then we return an
1459 * IO error and ignore the error code passed down to us, since
1460 * it is very likely just an effect of a nullified replacement
1461 * mapping page */
1462
1463 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1464 r = -EIO;
1465
50f20cfd
LP
1466 journal_file_post_change(f);
1467
cec736d2
LP
1468 return r;
1469}
1470
/* One cached position inside an entry array chain, keyed by the offset
 * of the chain's first array. */
typedef struct ChainCacheItem {
        uint64_t first;      /* offset of the array that starts the chain */
        uint64_t array;      /* offset of the array this item caches */
        uint64_t begin;      /* first item stored in the cached array */
        uint64_t total;      /* number of items in all arrays preceding the cached one */
        uint64_t last_index; /* index looked at last, used to keep bisection local */
} ChainCacheItem;
1478
1479static void chain_cache_put(
4743015d 1480 OrderedHashmap *h,
a4bcff5b
LP
1481 ChainCacheItem *ci,
1482 uint64_t first,
1483 uint64_t array,
1484 uint64_t begin,
f268980d
LP
1485 uint64_t total,
1486 uint64_t last_index) {
a4bcff5b
LP
1487
1488 if (!ci) {
34741aa3
LP
1489 /* If the chain item to cache for this chain is the
1490 * first one it's not worth caching anything */
1491 if (array == first)
1492 return;
1493
29433089 1494 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 1495 ci = ordered_hashmap_steal_first(h);
29433089
LP
1496 assert(ci);
1497 } else {
a4bcff5b
LP
1498 ci = new(ChainCacheItem, 1);
1499 if (!ci)
1500 return;
1501 }
1502
1503 ci->first = first;
1504
4743015d 1505 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
1506 free(ci);
1507 return;
1508 }
1509 } else
1510 assert(ci->first == first);
1511
1512 ci->array = array;
1513 ci->begin = begin;
1514 ci->total = total;
f268980d 1515 ci->last_index = last_index;
a4bcff5b
LP
1516}
1517
f268980d
LP
1518static int generic_array_get(
1519 JournalFile *f,
1520 uint64_t first,
1521 uint64_t i,
1522 Object **ret, uint64_t *offset) {
de190aef 1523
cec736d2 1524 Object *o;
a4bcff5b 1525 uint64_t p = 0, a, t = 0;
cec736d2 1526 int r;
a4bcff5b 1527 ChainCacheItem *ci;
cec736d2
LP
1528
1529 assert(f);
1530
de190aef 1531 a = first;
a4bcff5b
LP
1532
1533 /* Try the chain cache first */
4743015d 1534 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1535 if (ci && i > ci->total) {
1536 a = ci->array;
1537 i -= ci->total;
1538 t = ci->total;
1539 }
1540
de190aef 1541 while (a > 0) {
a4bcff5b 1542 uint64_t k;
cec736d2 1543
de190aef
LP
1544 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1545 if (r < 0)
1546 return r;
cec736d2 1547
a4bcff5b
LP
1548 k = journal_file_entry_array_n_items(o);
1549 if (i < k) {
de190aef 1550 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1551 goto found;
cec736d2
LP
1552 }
1553
a4bcff5b
LP
1554 i -= k;
1555 t += k;
de190aef
LP
1556 a = le64toh(o->entry_array.next_entry_array_offset);
1557 }
1558
a4bcff5b
LP
1559 return 0;
1560
1561found:
1562 /* Let's cache this item for the next invocation */
af13a6b0 1563 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1564
1565 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1566 if (r < 0)
1567 return r;
1568
1569 if (ret)
1570 *ret = o;
1571
1572 if (offset)
1573 *offset = p;
1574
1575 return 1;
1576}
1577
f268980d
LP
1578static int generic_array_get_plus_one(
1579 JournalFile *f,
1580 uint64_t extra,
1581 uint64_t first,
1582 uint64_t i,
1583 Object **ret, uint64_t *offset) {
de190aef
LP
1584
1585 Object *o;
1586
1587 assert(f);
1588
1589 if (i == 0) {
1590 int r;
1591
1592 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1593 if (r < 0)
1594 return r;
1595
de190aef
LP
1596 if (ret)
1597 *ret = o;
cec736d2 1598
de190aef
LP
1599 if (offset)
1600 *offset = extra;
cec736d2 1601
de190aef 1602 return 1;
cec736d2
LP
1603 }
1604
de190aef
LP
1605 return generic_array_get(f, first, i-1, ret, offset);
1606}
cec736d2 1607
de190aef
LP
/* Outcomes of the test_object callbacks used for bisection */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object sorts before the needle */
        TEST_RIGHT = 2  /* the tested object sorts after the needle */
};
cec736d2 1613
f268980d
LP
/* Bisects the entry array chain starting at offset 'first', looking at
 * no more than 'n' items in total, for an entry matching 'needle'.
 *
 * 'test_object' is called with the offset of a candidate entry and the
 * needle and returns TEST_FOUND, TEST_LEFT, TEST_RIGHT or a negative
 * error. A TEST_FOUND result is treated as TEST_RIGHT when 'direction'
 * is DIRECTION_DOWN and as TEST_LEFT otherwise.
 *
 * Returns 1 if an entry was located, in which case *ret, *offset and
 * *idx (each optional) receive the entry object, its file offset and
 * its index within the chain. Returns 0 if nothing suitable exists,
 * -EBADMSG if an array slot inside the searched range holds offset 0,
 * and otherwise the negative value of the failing helper. */
1614static int generic_array_bisect(
1615 JournalFile *f,
1616 uint64_t first,
1617 uint64_t n,
1618 uint64_t needle,
1619 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1620 direction_t direction,
1621 Object **ret,
1622 uint64_t *offset,
1623 uint64_t *idx) {
1624
1625 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1626 bool subtract_one = false;
1627 Object *o, *array = NULL;
1628 int r;
a4bcff5b 1629 ChainCacheItem *ci;
cec736d2 1630
de190aef
LP
1631 assert(f);
1632 assert(test_object);
cec736d2 1633
a4bcff5b 1634 /* Start with the first array in the chain */
de190aef 1635 a = first;
a4bcff5b 1636
4743015d 1637 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1638 if (ci && n > ci->total) {
1639 /* Ah, we have iterated this bisection array chain
1640 * previously! Let's see if we can skip ahead in the
1641 * chain, as far as the last time. But we can't jump
1642 * backwards in the chain, so let's check that
1643 * first. */
1644
1645 r = test_object(f, ci->begin, needle);
1646 if (r < 0)
1647 return r;
1648
1649 if (r == TEST_LEFT) {
f268980d 1650 /* OK, what we are looking for is right of the
a4bcff5b
LP
1651 * begin of this EntryArray, so let's jump
1652 * straight to previously cached array in the
1653 * chain */
1654
1655 a = ci->array;
1656 n -= ci->total;
1657 t = ci->total;
f268980d 1658 last_index = ci->last_index;
a4bcff5b
LP
1659 }
1660 }
1661
de190aef
LP
1662 while (a > 0) {
1663 uint64_t left, right, k, lp;
1664
1665 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1666 if (r < 0)
1667 return r;
1668
/* Look at no more items of this array than the remaining count 'n' allows */
de190aef
LP
1669 k = journal_file_entry_array_n_items(array);
1670 right = MIN(k, n);
1671 if (right <= 0)
1672 return 0;
cec736d2 1673
/* Test the last item in range first: unless it tests TEST_RIGHT the
 * bisection below is skipped for this array */
de190aef
LP
1674 i = right - 1;
1675 lp = p = le64toh(array->entry_array.items[i]);
1676 if (p <= 0)
1677 return -EBADMSG;
cec736d2 1678
de190aef
LP
1679 r = test_object(f, p, needle);
1680 if (r < 0)
1681 return r;
cec736d2 1682
de190aef
LP
1683 if (r == TEST_FOUND)
1684 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1685
1686 if (r == TEST_RIGHT) {
1687 left = 0;
1688 right -= 1;
f268980d
LP
1689
1690 if (last_index != (uint64_t) -1) {
1691 assert(last_index <= right);
1692
1693 /* If we cached the last index we
1694 * looked at, let's try to not to jump
1695 * too wildly around and see if we can
1696 * limit the range to look at early to
1697 * the immediate neighbors of the last
1698 * index we looked at. */
1699
1700 if (last_index > 0) {
1701 uint64_t x = last_index - 1;
1702
1703 p = le64toh(array->entry_array.items[x]);
1704 if (p <= 0)
1705 return -EBADMSG;
1706
1707 r = test_object(f, p, needle);
1708 if (r < 0)
1709 return r;
1710
1711 if (r == TEST_FOUND)
1712 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1713
1714 if (r == TEST_RIGHT)
1715 right = x;
1716 else
1717 left = x + 1;
1718 }
1719
1720 if (last_index < right) {
1721 uint64_t y = last_index + 1;
1722
1723 p = le64toh(array->entry_array.items[y]);
1724 if (p <= 0)
1725 return -EBADMSG;
1726
1727 r = test_object(f, p, needle);
1728 if (r < 0)
1729 return r;
1730
1731 if (r == TEST_FOUND)
1732 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1733
1734 if (r == TEST_RIGHT)
1735 right = y;
1736 else
1737 left = y + 1;
1738 }
f268980d
LP
1739 }
1740
/* Plain bisection of [left, right) until the range is empty */
de190aef
LP
1741 for (;;) {
1742 if (left == right) {
1743 if (direction == DIRECTION_UP)
1744 subtract_one = true;
1745
1746 i = left;
1747 goto found;
1748 }
1749
1750 assert(left < right);
de190aef 1751 i = (left + right) / 2;
f268980d 1752
de190aef
LP
1753 p = le64toh(array->entry_array.items[i]);
1754 if (p <= 0)
1755 return -EBADMSG;
1756
1757 r = test_object(f, p, needle);
1758 if (r < 0)
1759 return r;
cec736d2 1760
de190aef
LP
1761 if (r == TEST_FOUND)
1762 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1763
1764 if (r == TEST_RIGHT)
1765 right = i;
1766 else
1767 left = i + 1;
1768 }
1769 }
1770
/* This array covers all remaining items, so the search range ends here */
2173cbf8 1771 if (k >= n) {
cbdca852
LP
1772 if (direction == DIRECTION_UP) {
1773 i = n;
1774 subtract_one = true;
1775 goto found;
1776 }
1777
cec736d2 1778 return 0;
cbdca852 1779 }
cec736d2 1780
/* Remember the last item tested in this array, then advance to the next one */
de190aef
LP
1781 last_p = lp;
1782
1783 n -= k;
1784 t += k;
f268980d 1785 last_index = (uint64_t) -1;
de190aef 1786 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1787 }
1788
1789 return 0;
de190aef
LP
1790
1791found:
/* Stepping back from the very first item of the chain leaves nothing to return */
1792 if (subtract_one && t == 0 && i == 0)
1793 return 0;
1794
a4bcff5b 1795 /* Let's cache this item for the next invocation */
af13a6b0 1796 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 1797
/* With subtract_one set the result is the item before index i; for i == 0
 * that is the last item tested in the previous array (last_p) */
de190aef
LP
1798 if (subtract_one && i == 0)
1799 p = last_p;
1800 else if (subtract_one)
1801 p = le64toh(array->entry_array.items[i-1]);
1802 else
1803 p = le64toh(array->entry_array.items[i]);
1804
1805 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1806 if (r < 0)
1807 return r;
1808
1809 if (ret)
1810 *ret = o;
1811
1812 if (offset)
1813 *offset = p;
1814
1815 if (idx)
cbdca852 1816 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1817
1818 return 1;
cec736d2
LP
1819}
1820
f268980d
LP
1821static int generic_array_bisect_plus_one(
1822 JournalFile *f,
1823 uint64_t extra,
1824 uint64_t first,
1825 uint64_t n,
1826 uint64_t needle,
1827 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1828 direction_t direction,
1829 Object **ret,
1830 uint64_t *offset,
1831 uint64_t *idx) {
de190aef 1832
cec736d2 1833 int r;
cbdca852
LP
1834 bool step_back = false;
1835 Object *o;
cec736d2
LP
1836
1837 assert(f);
de190aef 1838 assert(test_object);
cec736d2 1839
de190aef
LP
1840 if (n <= 0)
1841 return 0;
cec736d2 1842
de190aef
LP
1843 /* This bisects the array in object 'first', but first checks
1844 * an extra */
de190aef
LP
1845 r = test_object(f, extra, needle);
1846 if (r < 0)
1847 return r;
a536e261
LP
1848
1849 if (r == TEST_FOUND)
1850 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1851
cbdca852
LP
1852 /* if we are looking with DIRECTION_UP then we need to first
1853 see if in the actual array there is a matching entry, and
1854 return the last one of that. But if there isn't any we need
1855 to return this one. Hence remember this, and return it
1856 below. */
1857 if (r == TEST_LEFT)
1858 step_back = direction == DIRECTION_UP;
de190aef 1859
cbdca852
LP
1860 if (r == TEST_RIGHT) {
1861 if (direction == DIRECTION_DOWN)
1862 goto found;
1863 else
1864 return 0;
a536e261 1865 }
cec736d2 1866
de190aef
LP
1867 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1868
cbdca852
LP
1869 if (r == 0 && step_back)
1870 goto found;
1871
ecf68b1d 1872 if (r > 0 && idx)
de190aef
LP
1873 (*idx) ++;
1874
1875 return r;
cbdca852
LP
1876
1877found:
1878 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1879 if (r < 0)
1880 return r;
1881
1882 if (ret)
1883 *ret = o;
1884
1885 if (offset)
1886 *offset = extra;
1887
1888 if (idx)
1889 *idx = 0;
1890
1891 return 1;
1892}
1893
44a6b1b6 1894_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
1895 assert(f);
1896 assert(p > 0);
1897
1898 if (p == needle)
1899 return TEST_FOUND;
1900 else if (p < needle)
1901 return TEST_LEFT;
1902 else
1903 return TEST_RIGHT;
1904}
1905
de190aef
LP
1906static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1907 Object *o;
1908 int r;
1909
1910 assert(f);
1911 assert(p > 0);
1912
1913 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1914 if (r < 0)
1915 return r;
1916
de190aef
LP
1917 if (le64toh(o->entry.seqnum) == needle)
1918 return TEST_FOUND;
1919 else if (le64toh(o->entry.seqnum) < needle)
1920 return TEST_LEFT;
1921 else
1922 return TEST_RIGHT;
1923}
cec736d2 1924
de190aef
LP
1925int journal_file_move_to_entry_by_seqnum(
1926 JournalFile *f,
1927 uint64_t seqnum,
1928 direction_t direction,
1929 Object **ret,
1930 uint64_t *offset) {
1931
1932 return generic_array_bisect(f,
1933 le64toh(f->header->entry_array_offset),
1934 le64toh(f->header->n_entries),
1935 seqnum,
1936 test_object_seqnum,
1937 direction,
1938 ret, offset, NULL);
1939}
cec736d2 1940
de190aef
LP
1941static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1942 Object *o;
1943 int r;
1944
1945 assert(f);
1946 assert(p > 0);
1947
1948 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1949 if (r < 0)
1950 return r;
1951
1952 if (le64toh(o->entry.realtime) == needle)
1953 return TEST_FOUND;
1954 else if (le64toh(o->entry.realtime) < needle)
1955 return TEST_LEFT;
1956 else
1957 return TEST_RIGHT;
cec736d2
LP
1958}
1959
de190aef
LP
1960int journal_file_move_to_entry_by_realtime(
1961 JournalFile *f,
1962 uint64_t realtime,
1963 direction_t direction,
1964 Object **ret,
1965 uint64_t *offset) {
1966
1967 return generic_array_bisect(f,
1968 le64toh(f->header->entry_array_offset),
1969 le64toh(f->header->n_entries),
1970 realtime,
1971 test_object_realtime,
1972 direction,
1973 ret, offset, NULL);
1974}
1975
1976static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1977 Object *o;
1978 int r;
1979
1980 assert(f);
1981 assert(p > 0);
1982
1983 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1984 if (r < 0)
1985 return r;
1986
1987 if (le64toh(o->entry.monotonic) == needle)
1988 return TEST_FOUND;
1989 else if (le64toh(o->entry.monotonic) < needle)
1990 return TEST_LEFT;
1991 else
1992 return TEST_RIGHT;
1993}
1994
2a560338 1995static int find_data_object_by_boot_id(
47838ab3
ZJS
1996 JournalFile *f,
1997 sd_id128_t boot_id,
1998 Object **o,
1999 uint64_t *b) {
2a560338 2000
47838ab3
ZJS
2001 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2002
2003 sd_id128_to_string(boot_id, t + 9);
2004 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2005}
2006
de190aef
LP
2007int journal_file_move_to_entry_by_monotonic(
2008 JournalFile *f,
2009 sd_id128_t boot_id,
2010 uint64_t monotonic,
2011 direction_t direction,
2012 Object **ret,
2013 uint64_t *offset) {
2014
de190aef
LP
2015 Object *o;
2016 int r;
2017
cbdca852 2018 assert(f);
de190aef 2019
47838ab3 2020 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
2021 if (r < 0)
2022 return r;
cbdca852 2023 if (r == 0)
de190aef
LP
2024 return -ENOENT;
2025
2026 return generic_array_bisect_plus_one(f,
2027 le64toh(o->data.entry_offset),
2028 le64toh(o->data.entry_array_offset),
2029 le64toh(o->data.n_entries),
2030 monotonic,
2031 test_object_monotonic,
2032 direction,
2033 ret, offset, NULL);
2034}
2035
1fc605b0 2036void journal_file_reset_location(JournalFile *f) {
6573ef05 2037 f->location_type = LOCATION_HEAD;
1fc605b0 2038 f->current_offset = 0;
6573ef05
MS
2039 f->current_seqnum = 0;
2040 f->current_realtime = 0;
2041 f->current_monotonic = 0;
2042 zero(f->current_boot_id);
2043 f->current_xor_hash = 0;
2044}
2045
950c07d4 2046void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2047 f->location_type = LOCATION_SEEK;
2048 f->current_offset = offset;
2049 f->current_seqnum = le64toh(o->entry.seqnum);
2050 f->current_realtime = le64toh(o->entry.realtime);
2051 f->current_monotonic = le64toh(o->entry.monotonic);
2052 f->current_boot_id = o->entry.boot_id;
2053 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2054}
2055
d8ae66d7
MS
2056int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2057 assert(af);
2058 assert(bf);
2059 assert(af->location_type == LOCATION_SEEK);
2060 assert(bf->location_type == LOCATION_SEEK);
2061
2062 /* If contents and timestamps match, these entries are
2063 * identical, even if the seqnum does not match */
2064 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2065 af->current_monotonic == bf->current_monotonic &&
2066 af->current_realtime == bf->current_realtime &&
2067 af->current_xor_hash == bf->current_xor_hash)
2068 return 0;
2069
2070 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2071
2072 /* If this is from the same seqnum source, compare
2073 * seqnums */
2074 if (af->current_seqnum < bf->current_seqnum)
2075 return -1;
2076 if (af->current_seqnum > bf->current_seqnum)
2077 return 1;
2078
2079 /* Wow! This is weird, different data but the same
2080 * seqnums? Something is borked, but let's make the
2081 * best of it and compare by time. */
2082 }
2083
2084 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2085
2086 /* If the boot id matches, compare monotonic time */
2087 if (af->current_monotonic < bf->current_monotonic)
2088 return -1;
2089 if (af->current_monotonic > bf->current_monotonic)
2090 return 1;
2091 }
2092
2093 /* Otherwise, compare UTC time */
2094 if (af->current_realtime < bf->current_realtime)
2095 return -1;
2096 if (af->current_realtime > bf->current_realtime)
2097 return 1;
2098
2099 /* Finally, compare by contents */
2100 if (af->current_xor_hash < bf->current_xor_hash)
2101 return -1;
2102 if (af->current_xor_hash > bf->current_xor_hash)
2103 return 1;
2104
2105 return 0;
2106}
2107
de190aef
LP
2108int journal_file_next_entry(
2109 JournalFile *f,
f534928a 2110 uint64_t p,
de190aef
LP
2111 direction_t direction,
2112 Object **ret, uint64_t *offset) {
2113
fb099c8d 2114 uint64_t i, n, ofs;
cec736d2
LP
2115 int r;
2116
2117 assert(f);
de190aef
LP
2118
2119 n = le64toh(f->header->n_entries);
2120 if (n <= 0)
2121 return 0;
cec736d2 2122
f534928a 2123 if (p == 0)
de190aef 2124 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2125 else {
de190aef
LP
2126 r = generic_array_bisect(f,
2127 le64toh(f->header->entry_array_offset),
2128 le64toh(f->header->n_entries),
2129 p,
2130 test_object_offset,
2131 DIRECTION_DOWN,
2132 NULL, NULL,
2133 &i);
2134 if (r <= 0)
2135 return r;
2136
2137 if (direction == DIRECTION_DOWN) {
2138 if (i >= n - 1)
2139 return 0;
2140
2141 i++;
2142 } else {
2143 if (i <= 0)
2144 return 0;
2145
2146 i--;
2147 }
cec736d2
LP
2148 }
2149
de190aef 2150 /* And jump to it */
fb099c8d
ZJS
2151 r = generic_array_get(f,
2152 le64toh(f->header->entry_array_offset),
2153 i,
2154 ret, &ofs);
2155 if (r <= 0)
2156 return r;
2157
2158 if (p > 0 &&
2159 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2160 log_debug("%s: entry array corrupted at entry %"PRIu64,
2161 f->path, i);
2162 return -EBADMSG;
2163 }
2164
2165 if (offset)
2166 *offset = ofs;
2167
2168 return 1;
de190aef 2169}
cec736d2 2170
de190aef
LP
2171int journal_file_next_entry_for_data(
2172 JournalFile *f,
2173 Object *o, uint64_t p,
2174 uint64_t data_offset,
2175 direction_t direction,
2176 Object **ret, uint64_t *offset) {
2177
2178 uint64_t n, i;
cec736d2 2179 int r;
de190aef 2180 Object *d;
cec736d2
LP
2181
2182 assert(f);
de190aef 2183 assert(p > 0 || !o);
cec736d2 2184
de190aef 2185 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2186 if (r < 0)
de190aef 2187 return r;
cec736d2 2188
de190aef
LP
2189 n = le64toh(d->data.n_entries);
2190 if (n <= 0)
2191 return n;
cec736d2 2192
de190aef
LP
2193 if (!o)
2194 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2195 else {
2196 if (o->object.type != OBJECT_ENTRY)
2197 return -EINVAL;
cec736d2 2198
de190aef
LP
2199 r = generic_array_bisect_plus_one(f,
2200 le64toh(d->data.entry_offset),
2201 le64toh(d->data.entry_array_offset),
2202 le64toh(d->data.n_entries),
2203 p,
2204 test_object_offset,
2205 DIRECTION_DOWN,
2206 NULL, NULL,
2207 &i);
2208
2209 if (r <= 0)
cec736d2
LP
2210 return r;
2211
de190aef
LP
2212 if (direction == DIRECTION_DOWN) {
2213 if (i >= n - 1)
2214 return 0;
cec736d2 2215
de190aef
LP
2216 i++;
2217 } else {
2218 if (i <= 0)
2219 return 0;
cec736d2 2220
de190aef
LP
2221 i--;
2222 }
cec736d2 2223
de190aef 2224 }
cec736d2 2225
de190aef
LP
2226 return generic_array_get_plus_one(f,
2227 le64toh(d->data.entry_offset),
2228 le64toh(d->data.entry_array_offset),
2229 i,
2230 ret, offset);
2231}
cec736d2 2232
cbdca852
LP
2233int journal_file_move_to_entry_by_offset_for_data(
2234 JournalFile *f,
2235 uint64_t data_offset,
2236 uint64_t p,
2237 direction_t direction,
2238 Object **ret, uint64_t *offset) {
2239
2240 int r;
2241 Object *d;
2242
2243 assert(f);
2244
2245 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2246 if (r < 0)
2247 return r;
2248
2249 return generic_array_bisect_plus_one(f,
2250 le64toh(d->data.entry_offset),
2251 le64toh(d->data.entry_array_offset),
2252 le64toh(d->data.n_entries),
2253 p,
2254 test_object_offset,
2255 direction,
2256 ret, offset, NULL);
2257}
2258
2259int journal_file_move_to_entry_by_monotonic_for_data(
2260 JournalFile *f,
2261 uint64_t data_offset,
2262 sd_id128_t boot_id,
2263 uint64_t monotonic,
2264 direction_t direction,
2265 Object **ret, uint64_t *offset) {
2266
cbdca852
LP
2267 Object *o, *d;
2268 int r;
2269 uint64_t b, z;
2270
2271 assert(f);
2272
2273 /* First, seek by time */
47838ab3 2274 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2275 if (r < 0)
2276 return r;
2277 if (r == 0)
2278 return -ENOENT;
2279
2280 r = generic_array_bisect_plus_one(f,
2281 le64toh(o->data.entry_offset),
2282 le64toh(o->data.entry_array_offset),
2283 le64toh(o->data.n_entries),
2284 monotonic,
2285 test_object_monotonic,
2286 direction,
2287 NULL, &z, NULL);
2288 if (r <= 0)
2289 return r;
2290
2291 /* And now, continue seeking until we find an entry that
2292 * exists in both bisection arrays */
2293
2294 for (;;) {
2295 Object *qo;
2296 uint64_t p, q;
2297
2298 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2299 if (r < 0)
2300 return r;
2301
2302 r = generic_array_bisect_plus_one(f,
2303 le64toh(d->data.entry_offset),
2304 le64toh(d->data.entry_array_offset),
2305 le64toh(d->data.n_entries),
2306 z,
2307 test_object_offset,
2308 direction,
2309 NULL, &p, NULL);
2310 if (r <= 0)
2311 return r;
2312
2313 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2314 if (r < 0)
2315 return r;
2316
2317 r = generic_array_bisect_plus_one(f,
2318 le64toh(o->data.entry_offset),
2319 le64toh(o->data.entry_array_offset),
2320 le64toh(o->data.n_entries),
2321 p,
2322 test_object_offset,
2323 direction,
2324 &qo, &q, NULL);
2325
2326 if (r <= 0)
2327 return r;
2328
2329 if (p == q) {
2330 if (ret)
2331 *ret = qo;
2332 if (offset)
2333 *offset = q;
2334
2335 return 1;
2336 }
2337
2338 z = q;
2339 }
cbdca852
LP
2340}
2341
de190aef
LP
2342int journal_file_move_to_entry_by_seqnum_for_data(
2343 JournalFile *f,
2344 uint64_t data_offset,
2345 uint64_t seqnum,
2346 direction_t direction,
2347 Object **ret, uint64_t *offset) {
cec736d2 2348
de190aef
LP
2349 Object *d;
2350 int r;
cec736d2 2351
91a31dde
LP
2352 assert(f);
2353
de190aef 2354 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2355 if (r < 0)
de190aef 2356 return r;
cec736d2 2357
de190aef
LP
2358 return generic_array_bisect_plus_one(f,
2359 le64toh(d->data.entry_offset),
2360 le64toh(d->data.entry_array_offset),
2361 le64toh(d->data.n_entries),
2362 seqnum,
2363 test_object_seqnum,
2364 direction,
2365 ret, offset, NULL);
2366}
cec736d2 2367
de190aef
LP
2368int journal_file_move_to_entry_by_realtime_for_data(
2369 JournalFile *f,
2370 uint64_t data_offset,
2371 uint64_t realtime,
2372 direction_t direction,
2373 Object **ret, uint64_t *offset) {
2374
2375 Object *d;
2376 int r;
2377
91a31dde
LP
2378 assert(f);
2379
de190aef 2380 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2381 if (r < 0)
de190aef
LP
2382 return r;
2383
2384 return generic_array_bisect_plus_one(f,
2385 le64toh(d->data.entry_offset),
2386 le64toh(d->data.entry_array_offset),
2387 le64toh(d->data.n_entries),
2388 realtime,
2389 test_object_realtime,
2390 direction,
2391 ret, offset, NULL);
cec736d2
LP
2392}
2393
0284adc6 2394void journal_file_dump(JournalFile *f) {
7560fffc 2395 Object *o;
7560fffc 2396 int r;
0284adc6 2397 uint64_t p;
7560fffc
LP
2398
2399 assert(f);
2400
0284adc6 2401 journal_file_print_header(f);
7560fffc 2402
0284adc6
LP
2403 p = le64toh(f->header->header_size);
2404 while (p != 0) {
d05089d8 2405 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
2406 if (r < 0)
2407 goto fail;
7560fffc 2408
0284adc6 2409 switch (o->object.type) {
d98cc1f2 2410
0284adc6
LP
2411 case OBJECT_UNUSED:
2412 printf("Type: OBJECT_UNUSED\n");
2413 break;
d98cc1f2 2414
0284adc6
LP
2415 case OBJECT_DATA:
2416 printf("Type: OBJECT_DATA\n");
2417 break;
7560fffc 2418
3c1668da
LP
2419 case OBJECT_FIELD:
2420 printf("Type: OBJECT_FIELD\n");
2421 break;
2422
0284adc6 2423 case OBJECT_ENTRY:
507f22bd
ZJS
2424 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2425 le64toh(o->entry.seqnum),
2426 le64toh(o->entry.monotonic),
2427 le64toh(o->entry.realtime));
0284adc6 2428 break;
7560fffc 2429
0284adc6
LP
2430 case OBJECT_FIELD_HASH_TABLE:
2431 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2432 break;
7560fffc 2433
0284adc6
LP
2434 case OBJECT_DATA_HASH_TABLE:
2435 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2436 break;
7560fffc 2437
0284adc6
LP
2438 case OBJECT_ENTRY_ARRAY:
2439 printf("Type: OBJECT_ENTRY_ARRAY\n");
2440 break;
7560fffc 2441
0284adc6 2442 case OBJECT_TAG:
507f22bd
ZJS
2443 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2444 le64toh(o->tag.seqnum),
2445 le64toh(o->tag.epoch));
0284adc6 2446 break;
3c1668da
LP
2447
2448 default:
8facc349 2449 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 2450 break;
0284adc6 2451 }
7560fffc 2452
d89c8fdf
ZJS
2453 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2454 printf("Flags: %s\n",
2455 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 2456
0284adc6
LP
2457 if (p == le64toh(f->header->tail_object_offset))
2458 p = 0;
2459 else
2460 p = p + ALIGN64(le64toh(o->object.size));
2461 }
7560fffc 2462
0284adc6
LP
2463 return;
2464fail:
2465 log_error("File corrupt");
7560fffc
LP
2466}
2467
718fe4b1
ZJS
2468static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2469 const char *x;
2470
2471 x = format_timestamp(buf, l, t);
2472 if (x)
2473 return x;
2474 return " --- ";
2475}
2476
0284adc6 2477void journal_file_print_header(JournalFile *f) {
2765b7bb 2478 char a[33], b[33], c[33], d[33];
ed375beb 2479 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2480 struct stat st;
2481 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2482
2483 assert(f);
7560fffc 2484
0284adc6
LP
2485 printf("File Path: %s\n"
2486 "File ID: %s\n"
2487 "Machine ID: %s\n"
2488 "Boot ID: %s\n"
2489 "Sequential Number ID: %s\n"
2490 "State: %s\n"
2491 "Compatible Flags:%s%s\n"
d89c8fdf 2492 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
2493 "Header size: %"PRIu64"\n"
2494 "Arena size: %"PRIu64"\n"
2495 "Data Hash Table Size: %"PRIu64"\n"
2496 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2497 "Rotate Suggested: %s\n"
507f22bd
ZJS
2498 "Head Sequential Number: %"PRIu64"\n"
2499 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2500 "Head Realtime Timestamp: %s\n"
3223f44f 2501 "Tail Realtime Timestamp: %s\n"
ed375beb 2502 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2503 "Objects: %"PRIu64"\n"
2504 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2505 f->path,
2506 sd_id128_to_string(f->header->file_id, a),
2507 sd_id128_to_string(f->header->machine_id, b),
2508 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2509 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2510 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2511 f->header->state == STATE_ONLINE ? "ONLINE" :
2512 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 2513 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
2514 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2515 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2516 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2517 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
2518 le64toh(f->header->header_size),
2519 le64toh(f->header->arena_size),
2520 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2521 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2522 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2523 le64toh(f->header->head_entry_seqnum),
2524 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2525 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2526 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2527 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2528 le64toh(f->header->n_objects),
2529 le64toh(f->header->n_entries));
7560fffc 2530
0284adc6 2531 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2532 printf("Data Objects: %"PRIu64"\n"
0284adc6 2533 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2534 le64toh(f->header->n_data),
0284adc6 2535 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2536
0284adc6 2537 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2538 printf("Field Objects: %"PRIu64"\n"
0284adc6 2539 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2540 le64toh(f->header->n_fields),
0284adc6 2541 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2542
2543 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2544 printf("Tag Objects: %"PRIu64"\n",
2545 le64toh(f->header->n_tags));
3223f44f 2546 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2547 printf("Entry Array Objects: %"PRIu64"\n",
2548 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2549
2550 if (fstat(f->fd, &st) >= 0)
59f448cf 2551 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
7560fffc
LP
2552}
2553
fc68c929
LP
2554static int journal_file_warn_btrfs(JournalFile *f) {
2555 unsigned attrs;
2556 int r;
2557
2558 assert(f);
2559
2560 /* Before we write anything, check if the COW logic is turned
2561 * off on btrfs. Given our write pattern that is quite
2562 * unfriendly to COW file systems this should greatly improve
2563 * performance on COW file systems, such as btrfs, at the
2564 * expense of data integrity features (which shouldn't be too
2565 * bad, given that we do our own checksumming). */
2566
2567 r = btrfs_is_filesystem(f->fd);
2568 if (r < 0)
2569 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2570 if (!r)
2571 return 0;
2572
2573 r = read_attr_fd(f->fd, &attrs);
2574 if (r < 0)
2575 return log_warning_errno(r, "Failed to read file attributes: %m");
2576
2577 if (attrs & FS_NOCOW_FL) {
2578 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2579 return 0;
2580 }
2581
2582 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2583 "This is likely to slow down journal access substantially, please consider turning "
2584 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2585
2586 return 1;
2587}
2588
0284adc6
LP
2589int journal_file_open(
2590 const char *fname,
2591 int flags,
2592 mode_t mode,
2593 bool compress,
baed47c3 2594 bool seal,
0284adc6
LP
2595 JournalMetrics *metrics,
2596 MMapCache *mmap_cache,
2597 JournalFile *template,
2598 JournalFile **ret) {
7560fffc 2599
fa6ac760 2600 bool newly_created = false;
0284adc6 2601 JournalFile *f;
fa6ac760 2602 void *h;
0284adc6 2603 int r;
7560fffc 2604
0284adc6 2605 assert(fname);
0559d3a5 2606 assert(ret);
7560fffc 2607
0284adc6
LP
2608 if ((flags & O_ACCMODE) != O_RDONLY &&
2609 (flags & O_ACCMODE) != O_RDWR)
2610 return -EINVAL;
7560fffc 2611
a0108012
LP
2612 if (!endswith(fname, ".journal") &&
2613 !endswith(fname, ".journal~"))
0284adc6 2614 return -EINVAL;
7560fffc 2615
0284adc6
LP
2616 f = new0(JournalFile, 1);
2617 if (!f)
2618 return -ENOMEM;
7560fffc 2619
0284adc6
LP
2620 f->fd = -1;
2621 f->mode = mode;
7560fffc 2622
0284adc6
LP
2623 f->flags = flags;
2624 f->prot = prot_from_flags(flags);
2625 f->writable = (flags & O_ACCMODE) != O_RDONLY;
92261977 2626#if defined(HAVE_LZ4)
d89c8fdf
ZJS
2627 f->compress_lz4 = compress;
2628#elif defined(HAVE_XZ)
2629 f->compress_xz = compress;
48b61739 2630#endif
49a32d43 2631#ifdef HAVE_GCRYPT
baed47c3 2632 f->seal = seal;
49a32d43 2633#endif
7560fffc 2634
0284adc6
LP
2635 if (mmap_cache)
2636 f->mmap = mmap_cache_ref(mmap_cache);
2637 else {
84168d80 2638 f->mmap = mmap_cache_new();
0284adc6
LP
2639 if (!f->mmap) {
2640 r = -ENOMEM;
2641 goto fail;
2642 }
2643 }
7560fffc 2644
0284adc6
LP
2645 f->path = strdup(fname);
2646 if (!f->path) {
2647 r = -ENOMEM;
2648 goto fail;
2649 }
7560fffc 2650
4743015d 2651 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
2652 if (!f->chain_cache) {
2653 r = -ENOMEM;
2654 goto fail;
2655 }
2656
0284adc6
LP
2657 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2658 if (f->fd < 0) {
2659 r = -errno;
2660 goto fail;
7560fffc 2661 }
7560fffc 2662
2678031a
LP
2663 r = journal_file_fstat(f);
2664 if (r < 0)
0284adc6 2665 goto fail;
7560fffc 2666
0284adc6 2667 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a 2668
fc68c929 2669 (void) journal_file_warn_btrfs(f);
11689d2a 2670
fb0951b0
LP
2671 /* Let's attach the creation time to the journal file,
2672 * so that the vacuuming code knows the age of this
2673 * file even if the file might end up corrupted one
2674 * day... Ideally we'd just use the creation time many
2675 * file systems maintain for each file, but there is
2676 * currently no usable API to query this, hence let's
2677 * emulate this via extended attributes. If extended
2678 * attributes are not supported we'll just skip this,
7517e174 2679 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0 2680
d61b600d 2681 fd_setcrtime(f->fd, 0);
7560fffc 2682
feb12d3e 2683#ifdef HAVE_GCRYPT
0284adc6 2684 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2685 * just don't do sealing */
49a32d43
LP
2686 if (f->seal) {
2687 r = journal_file_fss_load(f);
2688 if (r < 0)
2689 f->seal = false;
2690 }
feb12d3e 2691#endif
7560fffc 2692
0284adc6
LP
2693 r = journal_file_init_header(f, template);
2694 if (r < 0)
2695 goto fail;
7560fffc 2696
2678031a
LP
2697 r = journal_file_fstat(f);
2698 if (r < 0)
0284adc6 2699 goto fail;
fb0951b0
LP
2700
2701 newly_created = true;
0284adc6 2702 }
7560fffc 2703
0284adc6
LP
2704 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2705 r = -EIO;
2706 goto fail;
2707 }
7560fffc 2708
fa6ac760 2709 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
977eaa1e 2710 if (r < 0)
0284adc6 2711 goto fail;
7560fffc 2712
fa6ac760
LP
2713 f->header = h;
2714
0284adc6
LP
2715 if (!newly_created) {
2716 r = journal_file_verify_header(f);
2717 if (r < 0)
2718 goto fail;
2719 }
7560fffc 2720
feb12d3e 2721#ifdef HAVE_GCRYPT
0284adc6 2722 if (!newly_created && f->writable) {
baed47c3 2723 r = journal_file_fss_load(f);
0284adc6
LP
2724 if (r < 0)
2725 goto fail;
2726 }
feb12d3e 2727#endif
cec736d2
LP
2728
2729 if (f->writable) {
4a92baf3
LP
2730 if (metrics) {
2731 journal_default_metrics(metrics, f->fd);
2732 f->metrics = *metrics;
2733 } else if (template)
2734 f->metrics = template->metrics;
2735
cec736d2
LP
2736 r = journal_file_refresh_header(f);
2737 if (r < 0)
2738 goto fail;
2739 }
2740
feb12d3e 2741#ifdef HAVE_GCRYPT
baed47c3 2742 r = journal_file_hmac_setup(f);
14d10188
LP
2743 if (r < 0)
2744 goto fail;
feb12d3e 2745#endif
14d10188 2746
cec736d2 2747 if (newly_created) {
de190aef 2748 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2749 if (r < 0)
2750 goto fail;
2751
de190aef 2752 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2753 if (r < 0)
2754 goto fail;
7560fffc 2755
feb12d3e 2756#ifdef HAVE_GCRYPT
7560fffc
LP
2757 r = journal_file_append_first_tag(f);
2758 if (r < 0)
2759 goto fail;
feb12d3e 2760#endif
cec736d2
LP
2761 }
2762
fa6ac760
LP
2763 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
2764 r = -EIO;
2765 goto fail;
2766 }
2767
0559d3a5 2768 *ret = f;
cec736d2
LP
2769 return 0;
2770
2771fail:
fa6ac760
LP
2772 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
2773 r = -EIO;
2774
cec736d2
LP
2775 journal_file_close(f);
2776
2777 return r;
2778}
0ac38b70 2779
baed47c3 2780int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
57535f47 2781 _cleanup_free_ char *p = NULL;
0ac38b70
LP
2782 size_t l;
2783 JournalFile *old_file, *new_file = NULL;
2784 int r;
2785
2786 assert(f);
2787 assert(*f);
2788
2789 old_file = *f;
2790
2791 if (!old_file->writable)
2792 return -EINVAL;
2793
2794 if (!endswith(old_file->path, ".journal"))
2795 return -EINVAL;
2796
2797 l = strlen(old_file->path);
57535f47
ZJS
2798 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
2799 (int) l - 8, old_file->path,
2800 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
2801 le64toh((*f)->header->head_entry_seqnum),
2802 le64toh((*f)->header->head_entry_realtime));
2803 if (r < 0)
0ac38b70
LP
2804 return -ENOMEM;
2805
2678031a
LP
2806 /* Try to rename the file to the archived version. If the file
2807 * already was deleted, we'll get ENOENT, let's ignore that
2808 * case. */
0ac38b70 2809 r = rename(old_file->path, p);
2678031a 2810 if (r < 0 && errno != ENOENT)
0ac38b70
LP
2811 return -errno;
2812
ccdbaf91 2813 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2814
f27a3864
LP
2815 /* Currently, btrfs is not very good with out write patterns
2816 * and fragments heavily. Let's defrag our journal files when
2817 * we archive them */
2818 old_file->defrag_on_close = true;
2819
baed47c3 2820 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2821 journal_file_close(old_file);
2822
2823 *f = new_file;
2824 return r;
2825}
2826
9447a7f1
LP
2827int journal_file_open_reliably(
2828 const char *fname,
2829 int flags,
2830 mode_t mode,
7560fffc 2831 bool compress,
baed47c3 2832 bool seal,
4a92baf3 2833 JournalMetrics *metrics,
27370278 2834 MMapCache *mmap_cache,
9447a7f1
LP
2835 JournalFile *template,
2836 JournalFile **ret) {
2837
2838 int r;
2839 size_t l;
ed375beb 2840 _cleanup_free_ char *p = NULL;
9447a7f1 2841
070052ab 2842 r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
288359db
ZJS
2843 if (!IN_SET(r,
2844 -EBADMSG, /* corrupted */
2845 -ENODATA, /* truncated */
2846 -EHOSTDOWN, /* other machine */
2847 -EPROTONOSUPPORT, /* incompatible feature */
2848 -EBUSY, /* unclean shutdown */
2849 -ESHUTDOWN, /* already archived */
2850 -EIO, /* IO error, including SIGBUS on mmap */
2851 -EIDRM /* File has been deleted */))
9447a7f1
LP
2852 return r;
2853
2854 if ((flags & O_ACCMODE) == O_RDONLY)
2855 return r;
2856
2857 if (!(flags & O_CREAT))
2858 return r;
2859
7560fffc
LP
2860 if (!endswith(fname, ".journal"))
2861 return r;
2862
5c70eab4
LP
2863 /* The file is corrupted. Rotate it away and try it again (but only once) */
2864
9447a7f1 2865 l = strlen(fname);
d587eca5 2866 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
57535f47 2867 (int) l - 8, fname,
d587eca5 2868 now(CLOCK_REALTIME),
9bf3b535 2869 random_u64()) < 0)
9447a7f1
LP
2870 return -ENOMEM;
2871
65089b82 2872 if (rename(fname, p) < 0)
9447a7f1
LP
2873 return -errno;
2874
f27a3864
LP
2875 /* btrfs doesn't cope well with our write pattern and
2876 * fragments heavily. Let's defrag all files we rotate */
11689d2a
LP
2877
2878 (void) chattr_path(p, false, FS_NOCOW_FL);
f27a3864
LP
2879 (void) btrfs_defrag(p);
2880
65089b82 2881 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2882
070052ab 2883 return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
9447a7f1
LP
2884}
2885
cf244689
LP
2886int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2887 uint64_t i, n;
2888 uint64_t q, xor_hash = 0;
2889 int r;
2890 EntryItem *items;
2891 dual_timestamp ts;
2892
2893 assert(from);
2894 assert(to);
2895 assert(o);
2896 assert(p);
2897
2898 if (!to->writable)
2899 return -EPERM;
2900
2901 ts.monotonic = le64toh(o->entry.monotonic);
2902 ts.realtime = le64toh(o->entry.realtime);
2903
cf244689 2904 n = journal_file_entry_n_items(o);
4faa7004
TA
2905 /* alloca() can't take 0, hence let's allocate at least one */
2906 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
2907
2908 for (i = 0; i < n; i++) {
4fd052ae
FC
2909 uint64_t l, h;
2910 le64_t le_hash;
cf244689
LP
2911 size_t t;
2912 void *data;
2913 Object *u;
2914
2915 q = le64toh(o->entry.items[i].object_offset);
2916 le_hash = o->entry.items[i].hash;
2917
2918 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2919 if (r < 0)
2920 return r;
2921
2922 if (le_hash != o->data.hash)
2923 return -EBADMSG;
2924
2925 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2926 t = (size_t) l;
2927
2928 /* We hit the limit on 32bit machines */
2929 if ((uint64_t) t != l)
2930 return -E2BIG;
2931
d89c8fdf 2932 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 2933#if defined(HAVE_XZ) || defined(HAVE_LZ4)
a7f7d1bd 2934 size_t rsize = 0;
cf244689 2935
d89c8fdf
ZJS
2936 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
2937 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
2938 if (r < 0)
2939 return r;
cf244689
LP
2940
2941 data = from->compress_buffer;
2942 l = rsize;
3b1a55e1
ZJS
2943#else
2944 return -EPROTONOSUPPORT;
2945#endif
cf244689
LP
2946 } else
2947 data = o->data.payload;
2948
2949 r = journal_file_append_data(to, data, l, &u, &h);
2950 if (r < 0)
2951 return r;
2952
2953 xor_hash ^= le64toh(u->data.hash);
2954 items[i].object_offset = htole64(h);
2955 items[i].hash = u->data.hash;
2956
2957 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2958 if (r < 0)
2959 return r;
2960 }
2961
fa6ac760
LP
2962 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2963
2964 if (mmap_cache_got_sigbus(to->mmap, to->fd))
2965 return -EIO;
2966
2967 return r;
cf244689 2968}
babfc091 2969
8580d1f7
LP
2970void journal_reset_metrics(JournalMetrics *m) {
2971 assert(m);
2972
2973 /* Set everything to "pick automatic values". */
2974
2975 *m = (JournalMetrics) {
2976 .min_use = (uint64_t) -1,
2977 .max_use = (uint64_t) -1,
2978 .min_size = (uint64_t) -1,
2979 .max_size = (uint64_t) -1,
2980 .keep_free = (uint64_t) -1,
2981 .n_max_files = (uint64_t) -1,
2982 };
2983}
2984
babfc091 2985void journal_default_metrics(JournalMetrics *m, int fd) {
8580d1f7 2986 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
babfc091 2987 struct statvfs ss;
8580d1f7 2988 uint64_t fs_size;
babfc091
LP
2989
2990 assert(m);
2991 assert(fd >= 0);
2992
2993 if (fstatvfs(fd, &ss) >= 0)
2994 fs_size = ss.f_frsize * ss.f_blocks;
8580d1f7
LP
2995 else {
2996 log_debug_errno(errno, "Failed to detremine disk size: %m");
2997 fs_size = 0;
2998 }
babfc091
LP
2999
3000 if (m->max_use == (uint64_t) -1) {
3001
3002 if (fs_size > 0) {
3003 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3004
3005 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3006 m->max_use = DEFAULT_MAX_USE_UPPER;
3007
3008 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3009 m->max_use = DEFAULT_MAX_USE_LOWER;
3010 } else
3011 m->max_use = DEFAULT_MAX_USE_LOWER;
3012 } else {
3013 m->max_use = PAGE_ALIGN(m->max_use);
3014
8580d1f7 3015 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
babfc091
LP
3016 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3017 }
3018
8580d1f7
LP
3019 if (m->min_use == (uint64_t) -1)
3020 m->min_use = DEFAULT_MIN_USE;
3021
3022 if (m->min_use > m->max_use)
3023 m->min_use = m->max_use;
3024
babfc091
LP
3025 if (m->max_size == (uint64_t) -1) {
3026 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3027
3028 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3029 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3030 } else
3031 m->max_size = PAGE_ALIGN(m->max_size);
3032
8580d1f7
LP
3033 if (m->max_size != 0) {
3034 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3035 m->max_size = JOURNAL_FILE_SIZE_MIN;
babfc091 3036
8580d1f7
LP
3037 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3038 m->max_use = m->max_size*2;
3039 }
babfc091
LP
3040
3041 if (m->min_size == (uint64_t) -1)
3042 m->min_size = JOURNAL_FILE_SIZE_MIN;
3043 else {
3044 m->min_size = PAGE_ALIGN(m->min_size);
3045
3046 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3047 m->min_size = JOURNAL_FILE_SIZE_MIN;
3048
8580d1f7 3049 if (m->max_size != 0 && m->min_size > m->max_size)
babfc091
LP
3050 m->max_size = m->min_size;
3051 }
3052
3053 if (m->keep_free == (uint64_t) -1) {
3054
3055 if (fs_size > 0) {
8621b110 3056 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
3057
3058 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3059 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3060
3061 } else
3062 m->keep_free = DEFAULT_KEEP_FREE;
3063 }
3064
8580d1f7
LP
3065 if (m->n_max_files == (uint64_t) -1)
3066 m->n_max_files = DEFAULT_N_MAX_FILES;
3067
3068 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3069 format_bytes(a, sizeof(a), m->min_use),
3070 format_bytes(b, sizeof(b), m->max_use),
3071 format_bytes(c, sizeof(c), m->max_size),
3072 format_bytes(d, sizeof(d), m->min_size),
3073 format_bytes(e, sizeof(e), m->keep_free),
3074 m->n_max_files);
babfc091 3075}
08984293
LP
3076
3077int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
3078 assert(f);
3079 assert(from || to);
3080
3081 if (from) {
162566a4
LP
3082 if (f->header->head_entry_realtime == 0)
3083 return -ENOENT;
08984293 3084
162566a4 3085 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
3086 }
3087
3088 if (to) {
162566a4
LP
3089 if (f->header->tail_entry_realtime == 0)
3090 return -ENOENT;
08984293 3091
162566a4 3092 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
3093 }
3094
3095 return 1;
3096}
3097
3098int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
3099 Object *o;
3100 uint64_t p;
3101 int r;
3102
3103 assert(f);
3104 assert(from || to);
3105
47838ab3 3106 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
3107 if (r <= 0)
3108 return r;
3109
3110 if (le64toh(o->data.n_entries) <= 0)
3111 return 0;
3112
3113 if (from) {
3114 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3115 if (r < 0)
3116 return r;
3117
3118 *from = le64toh(o->entry.monotonic);
3119 }
3120
3121 if (to) {
3122 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3123 if (r < 0)
3124 return r;
3125
3126 r = generic_array_get_plus_one(f,
3127 le64toh(o->data.entry_offset),
3128 le64toh(o->data.entry_array_offset),
3129 le64toh(o->data.n_entries)-1,
3130 &o, NULL);
3131 if (r <= 0)
3132 return r;
3133
3134 *to = le64toh(o->entry.monotonic);
3135 }
3136
3137 return 1;
3138}
dca6219e 3139
fb0951b0 3140bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e
LP
3141 assert(f);
3142
3143 /* If we gained new header fields we gained new features,
3144 * hence suggest a rotation */
361f9cbc
LP
3145 if (le64toh(f->header->header_size) < sizeof(Header)) {
3146 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3147 return true;
361f9cbc 3148 }
dca6219e
LP
3149
3150 /* Let's check if the hash tables grew over a certain fill
3151 * level (75%, borrowing this value from Java's hash table
3152 * implementation), and if so suggest a rotation. To calculate
3153 * the fill level we need the n_data field, which only exists
3154 * in newer versions. */
3155
3156 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3157 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3158 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3159 f->path,
3160 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3161 le64toh(f->header->n_data),
3162 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3163 (unsigned long long) f->last_stat.st_size,
3164 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3165 return true;
361f9cbc 3166 }
dca6219e
LP
3167
3168 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3169 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3170 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3171 f->path,
3172 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3173 le64toh(f->header->n_fields),
3174 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3175 return true;
361f9cbc 3176 }
dca6219e 3177
0598fd4a
LP
3178 /* Are the data objects properly indexed by field objects? */
3179 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3180 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3181 le64toh(f->header->n_data) > 0 &&
3182 le64toh(f->header->n_fields) == 0)
3183 return true;
3184
fb0951b0
LP
3185 if (max_file_usec > 0) {
3186 usec_t t, h;
3187
3188 h = le64toh(f->header->head_entry_realtime);
3189 t = now(CLOCK_REALTIME);
3190
3191 if (h > 0 && t > h + max_file_usec)
3192 return true;
3193 }
3194
dca6219e
LP
3195 return false;
3196}