]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
journal: sort data items of entries by offset
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <sys/mman.h>
23#include <errno.h>
24#include <sys/uio.h>
25#include <unistd.h>
26#include <sys/statvfs.h>
27#include <fcntl.h>
28#include <stddef.h>
29
30#include "journal-def.h"
31#include "journal-file.h"
0284adc6 32#include "journal-authenticate.h"
cec736d2 33#include "lookup3.h"
807e17f0 34#include "compress.h"
7560fffc 35#include "fsprg.h"
cec736d2 36
4a92baf3
LP
37#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
38#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 39
be19b7df 40#define COMPRESSION_SIZE_THRESHOLD (512ULL)
807e17f0 41
babfc091 42/* This is the minimum journal file size */
b47ffcfd 43#define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL) /* 64 KiB */
babfc091
LP
44
45/* These are the lower and upper bounds if we deduce the max_use value
46 * from the file system size */
47#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
48#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
49
50/* This is the upper bound if we deduce max_size from max_use */
71100051 51#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
babfc091
LP
52
53/* This is the upper bound if we deduce the keep_free value from the
54 * file system size */
55#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
56
57/* This is the keep_free value when we can't determine the system
58 * size */
59#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
60
dca6219e
LP
61/* n_data was the first entry we added after the initial file format design */
62#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 63
cec736d2 64void journal_file_close(JournalFile *f) {
de190aef 65 assert(f);
cec736d2 66
feb12d3e 67#ifdef HAVE_GCRYPT
b0af6f41 68 /* Write the final tag */
c586dbf1 69 if (f->seal && f->writable)
b0af6f41 70 journal_file_append_tag(f);
feb12d3e 71#endif
b0af6f41 72
7560fffc 73 /* Sync everything to disk, before we mark the file offline */
16e9f408
LP
74 if (f->mmap && f->fd >= 0)
75 mmap_cache_close_fd(f->mmap, f->fd);
7560fffc
LP
76
77 if (f->writable && f->fd >= 0)
78 fdatasync(f->fd);
79
d384c7a8 80 if (f->header) {
cd96b3b8
LP
81 /* Mark the file offline. Don't override the archived state if it already is set */
82 if (f->writable && f->header->state == STATE_ONLINE)
d384c7a8 83 f->header->state = STATE_OFFLINE;
cec736d2 84
d384c7a8
MS
85 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
86 }
cec736d2 87
0ac38b70
LP
88 if (f->fd >= 0)
89 close_nointr_nofail(f->fd);
90
cec736d2 91 free(f->path);
807e17f0 92
16e9f408
LP
93 if (f->mmap)
94 mmap_cache_unref(f->mmap);
95
807e17f0
LP
96#ifdef HAVE_XZ
97 free(f->compress_buffer);
98#endif
99
7560fffc 100#ifdef HAVE_GCRYPT
baed47c3
LP
101 if (f->fss_file)
102 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
b7c9ae91
LP
103 else if (f->fsprg_state)
104 free(f->fsprg_state);
105
106 free(f->fsprg_seed);
7560fffc
LP
107
108 if (f->hmac)
109 gcry_md_close(f->hmac);
110#endif
111
cec736d2
LP
112 free(f);
113}
114
0ac38b70 115static int journal_file_init_header(JournalFile *f, JournalFile *template) {
cec736d2
LP
116 Header h;
117 ssize_t k;
118 int r;
119
120 assert(f);
121
122 zero(h);
7560fffc 123 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 124 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 125
7560fffc
LP
126 h.incompatible_flags =
127 htole32(f->compress ? HEADER_INCOMPATIBLE_COMPRESSED : 0);
128
129 h.compatible_flags =
baed47c3 130 htole32(f->seal ? HEADER_COMPATIBLE_SEALED : 0);
7560fffc 131
cec736d2
LP
132 r = sd_id128_randomize(&h.file_id);
133 if (r < 0)
134 return r;
135
0ac38b70
LP
136 if (template) {
137 h.seqnum_id = template->header->seqnum_id;
beec0085 138 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
139 } else
140 h.seqnum_id = h.file_id;
cec736d2
LP
141
142 k = pwrite(f->fd, &h, sizeof(h), 0);
143 if (k < 0)
144 return -errno;
145
146 if (k != sizeof(h))
147 return -EIO;
148
149 return 0;
150}
151
152static int journal_file_refresh_header(JournalFile *f) {
153 int r;
de190aef 154 sd_id128_t boot_id;
cec736d2
LP
155
156 assert(f);
157
158 r = sd_id128_get_machine(&f->header->machine_id);
159 if (r < 0)
160 return r;
161
de190aef 162 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
163 if (r < 0)
164 return r;
165
de190aef
LP
166 if (sd_id128_equal(boot_id, f->header->boot_id))
167 f->tail_entry_monotonic_valid = true;
168
169 f->header->boot_id = boot_id;
170
171 f->header->state = STATE_ONLINE;
b788cc23 172
7560fffc
LP
173 /* Sync the online state to disk */
174 msync(f->header, PAGE_ALIGN(sizeof(Header)), MS_SYNC);
175 fdatasync(f->fd);
b788cc23 176
cec736d2
LP
177 return 0;
178}
179
180static int journal_file_verify_header(JournalFile *f) {
181 assert(f);
182
7560fffc 183 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
184 return -EBADMSG;
185
7560fffc
LP
186 /* In both read and write mode we refuse to open files with
187 * incompatible flags we don't know */
807e17f0 188#ifdef HAVE_XZ
7560fffc 189 if ((le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
807e17f0
LP
190 return -EPROTONOSUPPORT;
191#else
cec736d2
LP
192 if (f->header->incompatible_flags != 0)
193 return -EPROTONOSUPPORT;
807e17f0 194#endif
cec736d2 195
7560fffc
LP
196 /* When open for writing we refuse to open files with
197 * compatible flags, too */
198 if (f->writable) {
199#ifdef HAVE_GCRYPT
baed47c3 200 if ((le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) != 0)
7560fffc
LP
201 return -EPROTONOSUPPORT;
202#else
203 if (f->header->compatible_flags != 0)
204 return -EPROTONOSUPPORT;
205#endif
206 }
207
db11ac1a
LP
208 if (f->header->state >= _STATE_MAX)
209 return -EBADMSG;
210
dca6219e
LP
211 /* The first addition was n_data, so check that we are at least this large */
212 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
213 return -EBADMSG;
214
8088cbd3 215 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
216 return -EBADMSG;
217
db11ac1a
LP
218 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
219 return -ENODATA;
220
221 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
222 return -ENODATA;
223
7762e02b
LP
224 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
225 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
226 !VALID64(le64toh(f->header->tail_object_offset)) ||
227 !VALID64(le64toh(f->header->entry_array_offset)))
228 return -ENODATA;
229
230 if (le64toh(f->header->data_hash_table_offset) < le64toh(f->header->header_size) ||
231 le64toh(f->header->field_hash_table_offset) < le64toh(f->header->header_size) ||
232 le64toh(f->header->tail_object_offset) < le64toh(f->header->header_size) ||
233 le64toh(f->header->entry_array_offset) < le64toh(f->header->header_size))
cec736d2
LP
234 return -ENODATA;
235
236 if (f->writable) {
ccdbaf91 237 uint8_t state;
cec736d2
LP
238 sd_id128_t machine_id;
239 int r;
240
241 r = sd_id128_get_machine(&machine_id);
242 if (r < 0)
243 return r;
244
245 if (!sd_id128_equal(machine_id, f->header->machine_id))
246 return -EHOSTDOWN;
247
de190aef 248 state = f->header->state;
cec736d2 249
71fa6f00
LP
250 if (state == STATE_ONLINE) {
251 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
252 return -EBUSY;
253 } else if (state == STATE_ARCHIVED)
cec736d2 254 return -ESHUTDOWN;
71fa6f00
LP
255 else if (state != STATE_OFFLINE) {
256 log_debug("Journal file %s has unknown state %u.", f->path, state);
257 return -EBUSY;
258 }
cec736d2
LP
259 }
260
8088cbd3 261 f->compress = JOURNAL_HEADER_COMPRESSED(f->header);
c586dbf1 262
f1889c91 263 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 264
cec736d2
LP
265 return 0;
266}
267
268static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
cec736d2 269 uint64_t old_size, new_size;
fec2aa2f 270 int r;
cec736d2
LP
271
272 assert(f);
273
cec736d2 274 /* We assume that this file is not sparse, and we know that
38ac38b2 275 * for sure, since we always call posix_fallocate()
cec736d2
LP
276 * ourselves */
277
278 old_size =
23b0b2b2 279 le64toh(f->header->header_size) +
cec736d2
LP
280 le64toh(f->header->arena_size);
281
bc85bfee 282 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
283 if (new_size < le64toh(f->header->header_size))
284 new_size = le64toh(f->header->header_size);
bc85bfee
LP
285
286 if (new_size <= old_size)
cec736d2
LP
287 return 0;
288
bc85bfee
LP
289 if (f->metrics.max_size > 0 &&
290 new_size > f->metrics.max_size)
291 return -E2BIG;
cec736d2 292
bc85bfee
LP
293 if (new_size > f->metrics.min_size &&
294 f->metrics.keep_free > 0) {
cec736d2
LP
295 struct statvfs svfs;
296
297 if (fstatvfs(f->fd, &svfs) >= 0) {
298 uint64_t available;
299
300 available = svfs.f_bfree * svfs.f_bsize;
301
bc85bfee
LP
302 if (available >= f->metrics.keep_free)
303 available -= f->metrics.keep_free;
cec736d2
LP
304 else
305 available = 0;
306
307 if (new_size - old_size > available)
308 return -E2BIG;
309 }
310 }
311
bc85bfee
LP
312 /* Note that the glibc fallocate() fallback is very
313 inefficient, hence we try to minimize the allocation area
314 as we can. */
fec2aa2f
GV
315 r = posix_fallocate(f->fd, old_size, new_size - old_size);
316 if (r != 0)
317 return -r;
cec736d2
LP
318
319 if (fstat(f->fd, &f->last_stat) < 0)
320 return -errno;
321
23b0b2b2 322 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2
LP
323
324 return 0;
325}
326
fcde2389 327static int journal_file_move_to(JournalFile *f, int context, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
cec736d2 328 assert(f);
cec736d2
LP
329 assert(ret);
330
7762e02b
LP
331 if (size <= 0)
332 return -EINVAL;
333
2a59ea54 334 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
335 if (offset + size > (uint64_t) f->last_stat.st_size) {
336 /* Hmm, out of range? Let's refresh the fstat() data
337 * first, before we trust that check. */
338
339 if (fstat(f->fd, &f->last_stat) < 0 ||
340 offset + size > (uint64_t) f->last_stat.st_size)
341 return -EADDRNOTAVAIL;
342 }
343
fcde2389 344 return mmap_cache_get(f->mmap, f->fd, f->prot, context, keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
345}
346
16e9f408
LP
347static uint64_t minimum_header_size(Object *o) {
348
349 static uint64_t table[] = {
350 [OBJECT_DATA] = sizeof(DataObject),
351 [OBJECT_FIELD] = sizeof(FieldObject),
352 [OBJECT_ENTRY] = sizeof(EntryObject),
353 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
354 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
355 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
356 [OBJECT_TAG] = sizeof(TagObject),
357 };
358
359 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
360 return sizeof(ObjectHeader);
361
362 return table[o->object.type];
363}
364
de190aef 365int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
cec736d2
LP
366 int r;
367 void *t;
368 Object *o;
369 uint64_t s;
16e9f408 370 unsigned context;
cec736d2
LP
371
372 assert(f);
373 assert(ret);
374
db11ac1a
LP
375 /* Objects may only be located at multiple of 64 bit */
376 if (!VALID64(offset))
377 return -EFAULT;
378
16e9f408
LP
379 /* One context for each type, plus one catch-all for the rest */
380 context = type > 0 && type < _OBJECT_TYPE_MAX ? type : 0;
381
fcde2389 382 r = journal_file_move_to(f, context, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
383 if (r < 0)
384 return r;
385
386 o = (Object*) t;
387 s = le64toh(o->object.size);
388
389 if (s < sizeof(ObjectHeader))
390 return -EBADMSG;
391
16e9f408
LP
392 if (o->object.type <= OBJECT_UNUSED)
393 return -EBADMSG;
394
395 if (s < minimum_header_size(o))
396 return -EBADMSG;
397
de190aef 398 if (type >= 0 && o->object.type != type)
cec736d2
LP
399 return -EBADMSG;
400
401 if (s > sizeof(ObjectHeader)) {
fcde2389 402 r = journal_file_move_to(f, o->object.type, false, offset, s, &t);
cec736d2
LP
403 if (r < 0)
404 return r;
405
406 o = (Object*) t;
407 }
408
cec736d2
LP
409 *ret = o;
410 return 0;
411}
412
d98cc1f2 413static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
414 uint64_t r;
415
416 assert(f);
417
beec0085 418 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
419
420 if (seqnum) {
de190aef 421 /* If an external seqnum counter was passed, we update
c2373f84
LP
422 * both the local and the external one, and set it to
423 * the maximum of both */
424
425 if (*seqnum + 1 > r)
426 r = *seqnum + 1;
427
428 *seqnum = r;
429 }
430
beec0085 431 f->header->tail_entry_seqnum = htole64(r);
cec736d2 432
beec0085
LP
433 if (f->header->head_entry_seqnum == 0)
434 f->header->head_entry_seqnum = htole64(r);
de190aef 435
cec736d2
LP
436 return r;
437}
438
0284adc6 439int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
440 int r;
441 uint64_t p;
442 Object *tail, *o;
443 void *t;
444
445 assert(f);
16e9f408 446 assert(type > 0 && type < _OBJECT_TYPE_MAX);
cec736d2
LP
447 assert(size >= sizeof(ObjectHeader));
448 assert(offset);
449 assert(ret);
450
451 p = le64toh(f->header->tail_object_offset);
cec736d2 452 if (p == 0)
23b0b2b2 453 p = le64toh(f->header->header_size);
cec736d2 454 else {
de190aef 455 r = journal_file_move_to_object(f, -1, p, &tail);
cec736d2
LP
456 if (r < 0)
457 return r;
458
459 p += ALIGN64(le64toh(tail->object.size));
460 }
461
462 r = journal_file_allocate(f, p, size);
463 if (r < 0)
464 return r;
465
fcde2389 466 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
467 if (r < 0)
468 return r;
469
470 o = (Object*) t;
471
472 zero(o->object);
de190aef 473 o->object.type = type;
cec736d2
LP
474 o->object.size = htole64(size);
475
476 f->header->tail_object_offset = htole64(p);
cec736d2
LP
477 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
478
479 *ret = o;
480 *offset = p;
481
482 return 0;
483}
484
de190aef 485static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
486 uint64_t s, p;
487 Object *o;
488 int r;
489
490 assert(f);
491
dfabe643 492 /* We estimate that we need 1 hash table entry per 768 of
4a92baf3
LP
493 journal file and we want to make sure we never get beyond
494 75% fill level. Calculate the hash table size for the
495 maximum file size based on these metrics. */
496
dfabe643 497 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
498 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
499 s = DEFAULT_DATA_HASH_TABLE_SIZE;
500
2b43f939 501 log_debug("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
4a92baf3 502
de190aef
LP
503 r = journal_file_append_object(f,
504 OBJECT_DATA_HASH_TABLE,
505 offsetof(Object, hash_table.items) + s,
506 &o, &p);
cec736d2
LP
507 if (r < 0)
508 return r;
509
de190aef 510 memset(o->hash_table.items, 0, s);
cec736d2 511
de190aef
LP
512 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
513 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
514
515 return 0;
516}
517
de190aef 518static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
519 uint64_t s, p;
520 Object *o;
521 int r;
522
523 assert(f);
524
de190aef
LP
525 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
526 r = journal_file_append_object(f,
527 OBJECT_FIELD_HASH_TABLE,
528 offsetof(Object, hash_table.items) + s,
529 &o, &p);
cec736d2
LP
530 if (r < 0)
531 return r;
532
de190aef 533 memset(o->hash_table.items, 0, s);
cec736d2 534
de190aef
LP
535 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
536 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
537
538 return 0;
539}
540
de190aef 541static int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
542 uint64_t s, p;
543 void *t;
544 int r;
545
546 assert(f);
547
de190aef
LP
548 p = le64toh(f->header->data_hash_table_offset);
549 s = le64toh(f->header->data_hash_table_size);
cec736d2 550
de190aef 551 r = journal_file_move_to(f,
16e9f408 552 OBJECT_DATA_HASH_TABLE,
fcde2389 553 true,
de190aef
LP
554 p, s,
555 &t);
cec736d2
LP
556 if (r < 0)
557 return r;
558
de190aef 559 f->data_hash_table = t;
cec736d2
LP
560 return 0;
561}
562
de190aef 563static int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
564 uint64_t s, p;
565 void *t;
566 int r;
567
568 assert(f);
569
de190aef
LP
570 p = le64toh(f->header->field_hash_table_offset);
571 s = le64toh(f->header->field_hash_table_size);
cec736d2 572
de190aef 573 r = journal_file_move_to(f,
16e9f408 574 OBJECT_FIELD_HASH_TABLE,
fcde2389 575 true,
de190aef
LP
576 p, s,
577 &t);
cec736d2
LP
578 if (r < 0)
579 return r;
580
de190aef 581 f->field_hash_table = t;
cec736d2
LP
582 return 0;
583}
584
de190aef
LP
585static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
586 uint64_t p, h;
cec736d2
LP
587 int r;
588
589 assert(f);
590 assert(o);
591 assert(offset > 0);
b588975f
LP
592
593 if (o->object.type != OBJECT_DATA)
594 return -EINVAL;
cec736d2 595
48496df6
LP
596 /* This might alter the window we are looking at */
597
de190aef
LP
598 o->data.next_hash_offset = o->data.next_field_offset = 0;
599 o->data.entry_offset = o->data.entry_array_offset = 0;
600 o->data.n_entries = 0;
cec736d2 601
de190aef 602 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
8db4213e 603 p = le64toh(f->data_hash_table[h].tail_hash_offset);
cec736d2
LP
604 if (p == 0) {
605 /* Only entry in the hash table is easy */
de190aef 606 f->data_hash_table[h].head_hash_offset = htole64(offset);
cec736d2 607 } else {
48496df6
LP
608 /* Move back to the previous data object, to patch in
609 * pointer */
cec736d2 610
de190aef 611 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
612 if (r < 0)
613 return r;
614
de190aef 615 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
616 }
617
de190aef 618 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 619
dca6219e
LP
620 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
621 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
622
cec736d2
LP
623 return 0;
624}
625
de190aef
LP
626int journal_file_find_data_object_with_hash(
627 JournalFile *f,
628 const void *data, uint64_t size, uint64_t hash,
629 Object **ret, uint64_t *offset) {
48496df6 630
de190aef 631 uint64_t p, osize, h;
cec736d2
LP
632 int r;
633
634 assert(f);
635 assert(data || size == 0);
636
637 osize = offsetof(Object, data.payload) + size;
638
bc85bfee
LP
639 if (f->header->data_hash_table_size == 0)
640 return -EBADMSG;
641
de190aef
LP
642 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
643 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 644
de190aef
LP
645 while (p > 0) {
646 Object *o;
cec736d2 647
de190aef 648 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
649 if (r < 0)
650 return r;
651
807e17f0 652 if (le64toh(o->data.hash) != hash)
85a131e8 653 goto next;
807e17f0
LP
654
655 if (o->object.flags & OBJECT_COMPRESSED) {
656#ifdef HAVE_XZ
b785c858 657 uint64_t l, rsize;
cec736d2 658
807e17f0
LP
659 l = le64toh(o->object.size);
660 if (l <= offsetof(Object, data.payload))
cec736d2
LP
661 return -EBADMSG;
662
807e17f0
LP
663 l -= offsetof(Object, data.payload);
664
665 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
666 return -EBADMSG;
667
b785c858 668 if (rsize == size &&
807e17f0
LP
669 memcmp(f->compress_buffer, data, size) == 0) {
670
671 if (ret)
672 *ret = o;
673
674 if (offset)
675 *offset = p;
676
677 return 1;
678 }
679#else
680 return -EPROTONOSUPPORT;
681#endif
682
683 } else if (le64toh(o->object.size) == osize &&
684 memcmp(o->data.payload, data, size) == 0) {
685
cec736d2
LP
686 if (ret)
687 *ret = o;
688
689 if (offset)
690 *offset = p;
691
de190aef 692 return 1;
cec736d2
LP
693 }
694
85a131e8 695 next:
cec736d2
LP
696 p = le64toh(o->data.next_hash_offset);
697 }
698
de190aef
LP
699 return 0;
700}
701
702int journal_file_find_data_object(
703 JournalFile *f,
704 const void *data, uint64_t size,
705 Object **ret, uint64_t *offset) {
706
707 uint64_t hash;
708
709 assert(f);
710 assert(data || size == 0);
711
712 hash = hash64(data, size);
713
714 return journal_file_find_data_object_with_hash(f,
715 data, size, hash,
716 ret, offset);
717}
718
48496df6
LP
719static int journal_file_append_data(
720 JournalFile *f,
721 const void *data, uint64_t size,
722 Object **ret, uint64_t *offset) {
723
de190aef
LP
724 uint64_t hash, p;
725 uint64_t osize;
726 Object *o;
727 int r;
807e17f0 728 bool compressed = false;
de190aef
LP
729
730 assert(f);
731 assert(data || size == 0);
732
733 hash = hash64(data, size);
734
735 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
736 if (r < 0)
737 return r;
738 else if (r > 0) {
739
740 if (ret)
741 *ret = o;
742
743 if (offset)
744 *offset = p;
745
746 return 0;
747 }
748
749 osize = offsetof(Object, data.payload) + size;
750 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
751 if (r < 0)
752 return r;
753
cec736d2 754 o->data.hash = htole64(hash);
807e17f0
LP
755
756#ifdef HAVE_XZ
757 if (f->compress &&
758 size >= COMPRESSION_SIZE_THRESHOLD) {
759 uint64_t rsize;
760
761 compressed = compress_blob(data, size, o->data.payload, &rsize);
762
763 if (compressed) {
764 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
765 o->object.flags |= OBJECT_COMPRESSED;
766
807e17f0
LP
767 log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
768 }
769 }
770#endif
771
64825d3c 772 if (!compressed && size > 0)
807e17f0 773 memcpy(o->data.payload, data, size);
cec736d2 774
de190aef 775 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
776 if (r < 0)
777 return r;
778
48496df6
LP
779 /* The linking might have altered the window, so let's
780 * refresh our pointer */
781 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
782 if (r < 0)
783 return r;
784
5996c7c2
LP
785#ifdef HAVE_GCRYPT
786 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
787 if (r < 0)
788 return r;
789#endif
790
cec736d2
LP
791 if (ret)
792 *ret = o;
793
794 if (offset)
de190aef 795 *offset = p;
cec736d2
LP
796
797 return 0;
798}
799
800uint64_t journal_file_entry_n_items(Object *o) {
801 assert(o);
b588975f
LP
802
803 if (o->object.type != OBJECT_ENTRY)
804 return 0;
cec736d2
LP
805
806 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
807}
808
0284adc6 809uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 810 assert(o);
b588975f
LP
811
812 if (o->object.type != OBJECT_ENTRY_ARRAY)
813 return 0;
de190aef
LP
814
815 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
816}
817
fb9a24b6
LP
818uint64_t journal_file_hash_table_n_items(Object *o) {
819 assert(o);
b588975f
LP
820
821 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
822 o->object.type != OBJECT_FIELD_HASH_TABLE)
823 return 0;
fb9a24b6
LP
824
825 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
826}
827
de190aef 828static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
829 le64_t *first,
830 le64_t *idx,
de190aef 831 uint64_t p) {
cec736d2 832 int r;
de190aef
LP
833 uint64_t n = 0, ap = 0, q, i, a, hidx;
834 Object *o;
835
cec736d2 836 assert(f);
de190aef
LP
837 assert(first);
838 assert(idx);
839 assert(p > 0);
cec736d2 840
de190aef
LP
841 a = le64toh(*first);
842 i = hidx = le64toh(*idx);
843 while (a > 0) {
844
845 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
846 if (r < 0)
847 return r;
cec736d2 848
de190aef
LP
849 n = journal_file_entry_array_n_items(o);
850 if (i < n) {
851 o->entry_array.items[i] = htole64(p);
852 *idx = htole64(hidx + 1);
853 return 0;
854 }
cec736d2 855
de190aef
LP
856 i -= n;
857 ap = a;
858 a = le64toh(o->entry_array.next_entry_array_offset);
859 }
860
861 if (hidx > n)
862 n = (hidx+1) * 2;
863 else
864 n = n * 2;
865
866 if (n < 4)
867 n = 4;
868
869 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
870 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
871 &o, &q);
cec736d2
LP
872 if (r < 0)
873 return r;
874
feb12d3e 875#ifdef HAVE_GCRYPT
5996c7c2 876 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
877 if (r < 0)
878 return r;
feb12d3e 879#endif
b0af6f41 880
de190aef 881 o->entry_array.items[i] = htole64(p);
cec736d2 882
de190aef 883 if (ap == 0)
7be3aa17 884 *first = htole64(q);
cec736d2 885 else {
de190aef 886 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
887 if (r < 0)
888 return r;
889
de190aef
LP
890 o->entry_array.next_entry_array_offset = htole64(q);
891 }
cec736d2 892
2dee23eb
LP
893 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
894 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
895
de190aef
LP
896 *idx = htole64(hidx + 1);
897
898 return 0;
899}
cec736d2 900
de190aef 901static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
902 le64_t *extra,
903 le64_t *first,
904 le64_t *idx,
de190aef
LP
905 uint64_t p) {
906
907 int r;
908
909 assert(f);
910 assert(extra);
911 assert(first);
912 assert(idx);
913 assert(p > 0);
914
915 if (*idx == 0)
916 *extra = htole64(p);
917 else {
4fd052ae 918 le64_t i;
de190aef 919
7be3aa17 920 i = htole64(le64toh(*idx) - 1);
de190aef
LP
921 r = link_entry_into_array(f, first, &i, p);
922 if (r < 0)
923 return r;
cec736d2
LP
924 }
925
de190aef
LP
926 *idx = htole64(le64toh(*idx) + 1);
927 return 0;
928}
929
930static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
931 uint64_t p;
932 int r;
933 assert(f);
934 assert(o);
935 assert(offset > 0);
936
937 p = le64toh(o->entry.items[i].object_offset);
938 if (p == 0)
939 return -EINVAL;
940
941 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
942 if (r < 0)
943 return r;
944
de190aef
LP
945 return link_entry_into_array_plus_one(f,
946 &o->data.entry_offset,
947 &o->data.entry_array_offset,
948 &o->data.n_entries,
949 offset);
cec736d2
LP
950}
951
952static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 953 uint64_t n, i;
cec736d2
LP
954 int r;
955
956 assert(f);
957 assert(o);
958 assert(offset > 0);
b588975f
LP
959
960 if (o->object.type != OBJECT_ENTRY)
961 return -EINVAL;
cec736d2 962
b788cc23
LP
963 __sync_synchronize();
964
cec736d2 965 /* Link up the entry itself */
de190aef
LP
966 r = link_entry_into_array(f,
967 &f->header->entry_array_offset,
968 &f->header->n_entries,
969 offset);
970 if (r < 0)
971 return r;
cec736d2 972
aaf53376 973 /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
cec736d2 974
de190aef 975 if (f->header->head_entry_realtime == 0)
0ac38b70 976 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 977
0ac38b70 978 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
979 f->header->tail_entry_monotonic = o->entry.monotonic;
980
981 f->tail_entry_monotonic_valid = true;
cec736d2
LP
982
983 /* Link up the items */
984 n = journal_file_entry_n_items(o);
985 for (i = 0; i < n; i++) {
986 r = journal_file_link_entry_item(f, o, offset, i);
987 if (r < 0)
988 return r;
989 }
990
cec736d2
LP
991 return 0;
992}
993
994static int journal_file_append_entry_internal(
995 JournalFile *f,
996 const dual_timestamp *ts,
997 uint64_t xor_hash,
998 const EntryItem items[], unsigned n_items,
de190aef 999 uint64_t *seqnum,
cec736d2
LP
1000 Object **ret, uint64_t *offset) {
1001 uint64_t np;
1002 uint64_t osize;
1003 Object *o;
1004 int r;
1005
1006 assert(f);
1007 assert(items || n_items == 0);
de190aef 1008 assert(ts);
cec736d2
LP
1009
1010 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1011
de190aef 1012 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1013 if (r < 0)
1014 return r;
1015
d98cc1f2 1016 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
cec736d2 1017 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1018 o->entry.realtime = htole64(ts->realtime);
1019 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1020 o->entry.xor_hash = htole64(xor_hash);
1021 o->entry.boot_id = f->header->boot_id;
1022
feb12d3e 1023#ifdef HAVE_GCRYPT
5996c7c2 1024 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1025 if (r < 0)
1026 return r;
feb12d3e 1027#endif
b0af6f41 1028
cec736d2
LP
1029 r = journal_file_link_entry(f, o, np);
1030 if (r < 0)
1031 return r;
1032
1033 if (ret)
1034 *ret = o;
1035
1036 if (offset)
1037 *offset = np;
1038
1039 return 0;
1040}
1041
cf244689 1042void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1043 assert(f);
1044
1045 /* inotify() does not receive IN_MODIFY events from file
1046 * accesses done via mmap(). After each access we hence
1047 * trigger IN_MODIFY by truncating the journal file to its
1048 * current size which triggers IN_MODIFY. */
1049
bc85bfee
LP
1050 __sync_synchronize();
1051
50f20cfd 1052 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
c5315881 1053 log_error("Failed to truncate file to its own size: %m");
50f20cfd
LP
1054}
1055
1f2da9ec
LP
1056static int entry_item_cmp(const void *_a, const void *_b) {
1057 const EntryItem *a = _a, *b = _b;
1058
1059 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1060 return -1;
1061 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1062 return 1;
1063 return 0;
1064}
1065
de190aef 1066int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1067 unsigned i;
1068 EntryItem *items;
1069 int r;
1070 uint64_t xor_hash = 0;
de190aef 1071 struct dual_timestamp _ts;
cec736d2
LP
1072
1073 assert(f);
1074 assert(iovec || n_iovec == 0);
1075
de190aef
LP
1076 if (!f->writable)
1077 return -EPERM;
1078
1079 if (!ts) {
1080 dual_timestamp_get(&_ts);
1081 ts = &_ts;
1082 }
1083
1084 if (f->tail_entry_monotonic_valid &&
1085 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1086 return -EINVAL;
1087
feb12d3e 1088#ifdef HAVE_GCRYPT
7560fffc
LP
1089 r = journal_file_maybe_append_tag(f, ts->realtime);
1090 if (r < 0)
1091 return r;
feb12d3e 1092#endif
7560fffc 1093
64825d3c
LP
1094 /* alloca() can't take 0, hence let's allocate at least one */
1095 items = alloca(sizeof(EntryItem) * MAX(1, n_iovec));
cec736d2
LP
1096
1097 for (i = 0; i < n_iovec; i++) {
1098 uint64_t p;
1099 Object *o;
1100
1101 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1102 if (r < 0)
cf244689 1103 return r;
cec736d2
LP
1104
1105 xor_hash ^= le64toh(o->data.hash);
1106 items[i].object_offset = htole64(p);
de7b95cd 1107 items[i].hash = o->data.hash;
cec736d2
LP
1108 }
1109
1f2da9ec
LP
1110 /* Order by the position on disk, in order to improve seek
1111 * times for rotating media. */
1112 qsort(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1113
de190aef 1114 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1115
50f20cfd
LP
1116 journal_file_post_change(f);
1117
cec736d2
LP
1118 return r;
1119}
1120
de190aef
LP
1121static int generic_array_get(JournalFile *f,
1122 uint64_t first,
1123 uint64_t i,
1124 Object **ret, uint64_t *offset) {
1125
cec736d2 1126 Object *o;
6c8a39b8 1127 uint64_t p = 0, a;
cec736d2
LP
1128 int r;
1129
1130 assert(f);
1131
de190aef
LP
1132 a = first;
1133 while (a > 0) {
1134 uint64_t n;
cec736d2 1135
de190aef
LP
1136 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1137 if (r < 0)
1138 return r;
cec736d2 1139
de190aef
LP
1140 n = journal_file_entry_array_n_items(o);
1141 if (i < n) {
1142 p = le64toh(o->entry_array.items[i]);
1143 break;
cec736d2
LP
1144 }
1145
de190aef
LP
1146 i -= n;
1147 a = le64toh(o->entry_array.next_entry_array_offset);
1148 }
1149
1150 if (a <= 0 || p <= 0)
1151 return 0;
1152
1153 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1154 if (r < 0)
1155 return r;
1156
1157 if (ret)
1158 *ret = o;
1159
1160 if (offset)
1161 *offset = p;
1162
1163 return 1;
1164}
1165
1166static int generic_array_get_plus_one(JournalFile *f,
1167 uint64_t extra,
1168 uint64_t first,
1169 uint64_t i,
1170 Object **ret, uint64_t *offset) {
1171
1172 Object *o;
1173
1174 assert(f);
1175
1176 if (i == 0) {
1177 int r;
1178
1179 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1180 if (r < 0)
1181 return r;
1182
de190aef
LP
1183 if (ret)
1184 *ret = o;
cec736d2 1185
de190aef
LP
1186 if (offset)
1187 *offset = extra;
cec736d2 1188
de190aef 1189 return 1;
cec736d2
LP
1190 }
1191
de190aef
LP
1192 return generic_array_get(f, first, i-1, ret, offset);
1193}
cec736d2 1194
de190aef
LP
/* Results of the test_object callbacks used by the bisection helpers.
 * All values are non-negative; the callbacks return negative values
 * for errors. */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object is before the needle */
        TEST_RIGHT = 2  /* the tested object is after the needle */
};
cec736d2 1200
de190aef
LP
1201static int generic_array_bisect(JournalFile *f,
1202 uint64_t first,
1203 uint64_t n,
1204 uint64_t needle,
1205 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1206 direction_t direction,
1207 Object **ret,
1208 uint64_t *offset,
1209 uint64_t *idx) {
1210
1211 uint64_t a, p, t = 0, i = 0, last_p = 0;
1212 bool subtract_one = false;
1213 Object *o, *array = NULL;
1214 int r;
cec736d2 1215
de190aef
LP
1216 assert(f);
1217 assert(test_object);
cec736d2 1218
de190aef
LP
1219 a = first;
1220 while (a > 0) {
1221 uint64_t left, right, k, lp;
1222
1223 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1224 if (r < 0)
1225 return r;
1226
de190aef
LP
1227 k = journal_file_entry_array_n_items(array);
1228 right = MIN(k, n);
1229 if (right <= 0)
1230 return 0;
cec736d2 1231
de190aef
LP
1232 i = right - 1;
1233 lp = p = le64toh(array->entry_array.items[i]);
1234 if (p <= 0)
1235 return -EBADMSG;
cec736d2 1236
de190aef
LP
1237 r = test_object(f, p, needle);
1238 if (r < 0)
1239 return r;
cec736d2 1240
de190aef
LP
1241 if (r == TEST_FOUND)
1242 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1243
1244 if (r == TEST_RIGHT) {
1245 left = 0;
1246 right -= 1;
1247 for (;;) {
1248 if (left == right) {
1249 if (direction == DIRECTION_UP)
1250 subtract_one = true;
1251
1252 i = left;
1253 goto found;
1254 }
1255
1256 assert(left < right);
1257
1258 i = (left + right) / 2;
1259 p = le64toh(array->entry_array.items[i]);
1260 if (p <= 0)
1261 return -EBADMSG;
1262
1263 r = test_object(f, p, needle);
1264 if (r < 0)
1265 return r;
cec736d2 1266
de190aef
LP
1267 if (r == TEST_FOUND)
1268 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1269
1270 if (r == TEST_RIGHT)
1271 right = i;
1272 else
1273 left = i + 1;
1274 }
1275 }
1276
cbdca852
LP
1277 if (k > n) {
1278 if (direction == DIRECTION_UP) {
1279 i = n;
1280 subtract_one = true;
1281 goto found;
1282 }
1283
cec736d2 1284 return 0;
cbdca852 1285 }
cec736d2 1286
de190aef
LP
1287 last_p = lp;
1288
1289 n -= k;
1290 t += k;
1291 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1292 }
1293
1294 return 0;
de190aef
LP
1295
1296found:
1297 if (subtract_one && t == 0 && i == 0)
1298 return 0;
1299
1300 if (subtract_one && i == 0)
1301 p = last_p;
1302 else if (subtract_one)
1303 p = le64toh(array->entry_array.items[i-1]);
1304 else
1305 p = le64toh(array->entry_array.items[i]);
1306
1307 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1308 if (r < 0)
1309 return r;
1310
1311 if (ret)
1312 *ret = o;
1313
1314 if (offset)
1315 *offset = p;
1316
1317 if (idx)
cbdca852 1318 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1319
1320 return 1;
cec736d2
LP
1321}
1322
de190aef
LP
1323static int generic_array_bisect_plus_one(JournalFile *f,
1324 uint64_t extra,
1325 uint64_t first,
1326 uint64_t n,
1327 uint64_t needle,
1328 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1329 direction_t direction,
1330 Object **ret,
1331 uint64_t *offset,
1332 uint64_t *idx) {
1333
cec736d2 1334 int r;
cbdca852
LP
1335 bool step_back = false;
1336 Object *o;
cec736d2
LP
1337
1338 assert(f);
de190aef 1339 assert(test_object);
cec736d2 1340
de190aef
LP
1341 if (n <= 0)
1342 return 0;
cec736d2 1343
de190aef
LP
1344 /* This bisects the array in object 'first', but first checks
1345 * an extra */
de190aef
LP
1346 r = test_object(f, extra, needle);
1347 if (r < 0)
1348 return r;
a536e261
LP
1349
1350 if (r == TEST_FOUND)
1351 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1352
cbdca852
LP
1353 /* if we are looking with DIRECTION_UP then we need to first
1354 see if in the actual array there is a matching entry, and
1355 return the last one of that. But if there isn't any we need
1356 to return this one. Hence remember this, and return it
1357 below. */
1358 if (r == TEST_LEFT)
1359 step_back = direction == DIRECTION_UP;
de190aef 1360
cbdca852
LP
1361 if (r == TEST_RIGHT) {
1362 if (direction == DIRECTION_DOWN)
1363 goto found;
1364 else
1365 return 0;
a536e261 1366 }
cec736d2 1367
de190aef
LP
1368 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1369
cbdca852
LP
1370 if (r == 0 && step_back)
1371 goto found;
1372
ecf68b1d 1373 if (r > 0 && idx)
de190aef
LP
1374 (*idx) ++;
1375
1376 return r;
cbdca852
LP
1377
1378found:
1379 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1380 if (r < 0)
1381 return r;
1382
1383 if (ret)
1384 *ret = o;
1385
1386 if (offset)
1387 *offset = extra;
1388
1389 if (idx)
1390 *idx = 0;
1391
1392 return 1;
1393}
1394
1395static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1396 assert(f);
1397 assert(p > 0);
1398
1399 if (p == needle)
1400 return TEST_FOUND;
1401 else if (p < needle)
1402 return TEST_LEFT;
1403 else
1404 return TEST_RIGHT;
1405}
1406
1407int journal_file_move_to_entry_by_offset(
1408 JournalFile *f,
1409 uint64_t p,
1410 direction_t direction,
1411 Object **ret,
1412 uint64_t *offset) {
1413
1414 return generic_array_bisect(f,
1415 le64toh(f->header->entry_array_offset),
1416 le64toh(f->header->n_entries),
1417 p,
1418 test_object_offset,
1419 direction,
1420 ret, offset, NULL);
de190aef
LP
1421}
1422
cbdca852 1423
de190aef
LP
1424static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1425 Object *o;
1426 int r;
1427
1428 assert(f);
1429 assert(p > 0);
1430
1431 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1432 if (r < 0)
1433 return r;
1434
de190aef
LP
1435 if (le64toh(o->entry.seqnum) == needle)
1436 return TEST_FOUND;
1437 else if (le64toh(o->entry.seqnum) < needle)
1438 return TEST_LEFT;
1439 else
1440 return TEST_RIGHT;
1441}
cec736d2 1442
de190aef
LP
1443int journal_file_move_to_entry_by_seqnum(
1444 JournalFile *f,
1445 uint64_t seqnum,
1446 direction_t direction,
1447 Object **ret,
1448 uint64_t *offset) {
1449
1450 return generic_array_bisect(f,
1451 le64toh(f->header->entry_array_offset),
1452 le64toh(f->header->n_entries),
1453 seqnum,
1454 test_object_seqnum,
1455 direction,
1456 ret, offset, NULL);
1457}
cec736d2 1458
de190aef
LP
1459static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1460 Object *o;
1461 int r;
1462
1463 assert(f);
1464 assert(p > 0);
1465
1466 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1467 if (r < 0)
1468 return r;
1469
1470 if (le64toh(o->entry.realtime) == needle)
1471 return TEST_FOUND;
1472 else if (le64toh(o->entry.realtime) < needle)
1473 return TEST_LEFT;
1474 else
1475 return TEST_RIGHT;
cec736d2
LP
1476}
1477
de190aef
LP
1478int journal_file_move_to_entry_by_realtime(
1479 JournalFile *f,
1480 uint64_t realtime,
1481 direction_t direction,
1482 Object **ret,
1483 uint64_t *offset) {
1484
1485 return generic_array_bisect(f,
1486 le64toh(f->header->entry_array_offset),
1487 le64toh(f->header->n_entries),
1488 realtime,
1489 test_object_realtime,
1490 direction,
1491 ret, offset, NULL);
1492}
1493
1494static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1495 Object *o;
1496 int r;
1497
1498 assert(f);
1499 assert(p > 0);
1500
1501 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1502 if (r < 0)
1503 return r;
1504
1505 if (le64toh(o->entry.monotonic) == needle)
1506 return TEST_FOUND;
1507 else if (le64toh(o->entry.monotonic) < needle)
1508 return TEST_LEFT;
1509 else
1510 return TEST_RIGHT;
1511}
1512
1513int journal_file_move_to_entry_by_monotonic(
1514 JournalFile *f,
1515 sd_id128_t boot_id,
1516 uint64_t monotonic,
1517 direction_t direction,
1518 Object **ret,
1519 uint64_t *offset) {
1520
10b6f904 1521 char t[9+32+1] = "_BOOT_ID=";
de190aef
LP
1522 Object *o;
1523 int r;
1524
cbdca852 1525 assert(f);
de190aef 1526
cbdca852 1527 sd_id128_to_string(boot_id, t + 9);
de190aef
LP
1528 r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1529 if (r < 0)
1530 return r;
cbdca852 1531 if (r == 0)
de190aef
LP
1532 return -ENOENT;
1533
1534 return generic_array_bisect_plus_one(f,
1535 le64toh(o->data.entry_offset),
1536 le64toh(o->data.entry_array_offset),
1537 le64toh(o->data.n_entries),
1538 monotonic,
1539 test_object_monotonic,
1540 direction,
1541 ret, offset, NULL);
1542}
1543
de190aef
LP
1544int journal_file_next_entry(
1545 JournalFile *f,
1546 Object *o, uint64_t p,
1547 direction_t direction,
1548 Object **ret, uint64_t *offset) {
1549
1550 uint64_t i, n;
cec736d2
LP
1551 int r;
1552
1553 assert(f);
de190aef
LP
1554 assert(p > 0 || !o);
1555
1556 n = le64toh(f->header->n_entries);
1557 if (n <= 0)
1558 return 0;
cec736d2
LP
1559
1560 if (!o)
de190aef 1561 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 1562 else {
de190aef 1563 if (o->object.type != OBJECT_ENTRY)
cec736d2
LP
1564 return -EINVAL;
1565
de190aef
LP
1566 r = generic_array_bisect(f,
1567 le64toh(f->header->entry_array_offset),
1568 le64toh(f->header->n_entries),
1569 p,
1570 test_object_offset,
1571 DIRECTION_DOWN,
1572 NULL, NULL,
1573 &i);
1574 if (r <= 0)
1575 return r;
1576
1577 if (direction == DIRECTION_DOWN) {
1578 if (i >= n - 1)
1579 return 0;
1580
1581 i++;
1582 } else {
1583 if (i <= 0)
1584 return 0;
1585
1586 i--;
1587 }
cec736d2
LP
1588 }
1589
de190aef
LP
1590 /* And jump to it */
1591 return generic_array_get(f,
1592 le64toh(f->header->entry_array_offset),
1593 i,
1594 ret, offset);
1595}
cec736d2 1596
de190aef
LP
1597int journal_file_skip_entry(
1598 JournalFile *f,
1599 Object *o, uint64_t p,
1600 int64_t skip,
1601 Object **ret, uint64_t *offset) {
1602
1603 uint64_t i, n;
1604 int r;
1605
1606 assert(f);
1607 assert(o);
1608 assert(p > 0);
1609
1610 if (o->object.type != OBJECT_ENTRY)
1611 return -EINVAL;
1612
1613 r = generic_array_bisect(f,
1614 le64toh(f->header->entry_array_offset),
1615 le64toh(f->header->n_entries),
1616 p,
1617 test_object_offset,
1618 DIRECTION_DOWN,
1619 NULL, NULL,
1620 &i);
1621 if (r <= 0)
cec736d2
LP
1622 return r;
1623
de190aef
LP
1624 /* Calculate new index */
1625 if (skip < 0) {
1626 if ((uint64_t) -skip >= i)
1627 i = 0;
1628 else
1629 i = i - (uint64_t) -skip;
1630 } else
1631 i += (uint64_t) skip;
cec736d2 1632
de190aef
LP
1633 n = le64toh(f->header->n_entries);
1634 if (n <= 0)
1635 return -EBADMSG;
cec736d2 1636
de190aef
LP
1637 if (i >= n)
1638 i = n-1;
1639
1640 return generic_array_get(f,
1641 le64toh(f->header->entry_array_offset),
1642 i,
1643 ret, offset);
cec736d2
LP
1644}
1645
de190aef
LP
1646int journal_file_next_entry_for_data(
1647 JournalFile *f,
1648 Object *o, uint64_t p,
1649 uint64_t data_offset,
1650 direction_t direction,
1651 Object **ret, uint64_t *offset) {
1652
1653 uint64_t n, i;
cec736d2 1654 int r;
de190aef 1655 Object *d;
cec736d2
LP
1656
1657 assert(f);
de190aef 1658 assert(p > 0 || !o);
cec736d2 1659
de190aef 1660 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 1661 if (r < 0)
de190aef 1662 return r;
cec736d2 1663
de190aef
LP
1664 n = le64toh(d->data.n_entries);
1665 if (n <= 0)
1666 return n;
cec736d2 1667
de190aef
LP
1668 if (!o)
1669 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1670 else {
1671 if (o->object.type != OBJECT_ENTRY)
1672 return -EINVAL;
cec736d2 1673
de190aef
LP
1674 r = generic_array_bisect_plus_one(f,
1675 le64toh(d->data.entry_offset),
1676 le64toh(d->data.entry_array_offset),
1677 le64toh(d->data.n_entries),
1678 p,
1679 test_object_offset,
1680 DIRECTION_DOWN,
1681 NULL, NULL,
1682 &i);
1683
1684 if (r <= 0)
cec736d2
LP
1685 return r;
1686
de190aef
LP
1687 if (direction == DIRECTION_DOWN) {
1688 if (i >= n - 1)
1689 return 0;
cec736d2 1690
de190aef
LP
1691 i++;
1692 } else {
1693 if (i <= 0)
1694 return 0;
cec736d2 1695
de190aef
LP
1696 i--;
1697 }
cec736d2 1698
de190aef 1699 }
cec736d2 1700
de190aef
LP
1701 return generic_array_get_plus_one(f,
1702 le64toh(d->data.entry_offset),
1703 le64toh(d->data.entry_array_offset),
1704 i,
1705 ret, offset);
1706}
cec736d2 1707
cbdca852
LP
1708int journal_file_move_to_entry_by_offset_for_data(
1709 JournalFile *f,
1710 uint64_t data_offset,
1711 uint64_t p,
1712 direction_t direction,
1713 Object **ret, uint64_t *offset) {
1714
1715 int r;
1716 Object *d;
1717
1718 assert(f);
1719
1720 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1721 if (r < 0)
1722 return r;
1723
1724 return generic_array_bisect_plus_one(f,
1725 le64toh(d->data.entry_offset),
1726 le64toh(d->data.entry_array_offset),
1727 le64toh(d->data.n_entries),
1728 p,
1729 test_object_offset,
1730 direction,
1731 ret, offset, NULL);
1732}
1733
1734int journal_file_move_to_entry_by_monotonic_for_data(
1735 JournalFile *f,
1736 uint64_t data_offset,
1737 sd_id128_t boot_id,
1738 uint64_t monotonic,
1739 direction_t direction,
1740 Object **ret, uint64_t *offset) {
1741
1742 char t[9+32+1] = "_BOOT_ID=";
1743 Object *o, *d;
1744 int r;
1745 uint64_t b, z;
1746
1747 assert(f);
1748
1749 /* First, seek by time */
1750 sd_id128_to_string(boot_id, t + 9);
1751 r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1752 if (r < 0)
1753 return r;
1754 if (r == 0)
1755 return -ENOENT;
1756
1757 r = generic_array_bisect_plus_one(f,
1758 le64toh(o->data.entry_offset),
1759 le64toh(o->data.entry_array_offset),
1760 le64toh(o->data.n_entries),
1761 monotonic,
1762 test_object_monotonic,
1763 direction,
1764 NULL, &z, NULL);
1765 if (r <= 0)
1766 return r;
1767
1768 /* And now, continue seeking until we find an entry that
1769 * exists in both bisection arrays */
1770
1771 for (;;) {
1772 Object *qo;
1773 uint64_t p, q;
1774
1775 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1776 if (r < 0)
1777 return r;
1778
1779 r = generic_array_bisect_plus_one(f,
1780 le64toh(d->data.entry_offset),
1781 le64toh(d->data.entry_array_offset),
1782 le64toh(d->data.n_entries),
1783 z,
1784 test_object_offset,
1785 direction,
1786 NULL, &p, NULL);
1787 if (r <= 0)
1788 return r;
1789
1790 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1791 if (r < 0)
1792 return r;
1793
1794 r = generic_array_bisect_plus_one(f,
1795 le64toh(o->data.entry_offset),
1796 le64toh(o->data.entry_array_offset),
1797 le64toh(o->data.n_entries),
1798 p,
1799 test_object_offset,
1800 direction,
1801 &qo, &q, NULL);
1802
1803 if (r <= 0)
1804 return r;
1805
1806 if (p == q) {
1807 if (ret)
1808 *ret = qo;
1809 if (offset)
1810 *offset = q;
1811
1812 return 1;
1813 }
1814
1815 z = q;
1816 }
1817
1818 return 0;
1819}
1820
de190aef
LP
1821int journal_file_move_to_entry_by_seqnum_for_data(
1822 JournalFile *f,
1823 uint64_t data_offset,
1824 uint64_t seqnum,
1825 direction_t direction,
1826 Object **ret, uint64_t *offset) {
cec736d2 1827
de190aef
LP
1828 Object *d;
1829 int r;
cec736d2 1830
91a31dde
LP
1831 assert(f);
1832
de190aef 1833 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 1834 if (r < 0)
de190aef 1835 return r;
cec736d2 1836
de190aef
LP
1837 return generic_array_bisect_plus_one(f,
1838 le64toh(d->data.entry_offset),
1839 le64toh(d->data.entry_array_offset),
1840 le64toh(d->data.n_entries),
1841 seqnum,
1842 test_object_seqnum,
1843 direction,
1844 ret, offset, NULL);
1845}
cec736d2 1846
de190aef
LP
1847int journal_file_move_to_entry_by_realtime_for_data(
1848 JournalFile *f,
1849 uint64_t data_offset,
1850 uint64_t realtime,
1851 direction_t direction,
1852 Object **ret, uint64_t *offset) {
1853
1854 Object *d;
1855 int r;
1856
91a31dde
LP
1857 assert(f);
1858
de190aef 1859 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 1860 if (r < 0)
de190aef
LP
1861 return r;
1862
1863 return generic_array_bisect_plus_one(f,
1864 le64toh(d->data.entry_offset),
1865 le64toh(d->data.entry_array_offset),
1866 le64toh(d->data.n_entries),
1867 realtime,
1868 test_object_realtime,
1869 direction,
1870 ret, offset, NULL);
cec736d2
LP
1871}
1872
0284adc6 1873void journal_file_dump(JournalFile *f) {
7560fffc 1874 Object *o;
7560fffc 1875 int r;
0284adc6 1876 uint64_t p;
7560fffc
LP
1877
1878 assert(f);
1879
0284adc6 1880 journal_file_print_header(f);
7560fffc 1881
0284adc6
LP
1882 p = le64toh(f->header->header_size);
1883 while (p != 0) {
1884 r = journal_file_move_to_object(f, -1, p, &o);
1885 if (r < 0)
1886 goto fail;
7560fffc 1887
0284adc6 1888 switch (o->object.type) {
d98cc1f2 1889
0284adc6
LP
1890 case OBJECT_UNUSED:
1891 printf("Type: OBJECT_UNUSED\n");
1892 break;
d98cc1f2 1893
0284adc6
LP
1894 case OBJECT_DATA:
1895 printf("Type: OBJECT_DATA\n");
1896 break;
7560fffc 1897
0284adc6 1898 case OBJECT_ENTRY:
f7fab8a5 1899 printf("Type: OBJECT_ENTRY seqnum=%llu monotonic=%llu realtime=%llu\n",
0284adc6
LP
1900 (unsigned long long) le64toh(o->entry.seqnum),
1901 (unsigned long long) le64toh(o->entry.monotonic),
1902 (unsigned long long) le64toh(o->entry.realtime));
1903 break;
7560fffc 1904
0284adc6
LP
1905 case OBJECT_FIELD_HASH_TABLE:
1906 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1907 break;
7560fffc 1908
0284adc6
LP
1909 case OBJECT_DATA_HASH_TABLE:
1910 printf("Type: OBJECT_DATA_HASH_TABLE\n");
1911 break;
7560fffc 1912
0284adc6
LP
1913 case OBJECT_ENTRY_ARRAY:
1914 printf("Type: OBJECT_ENTRY_ARRAY\n");
1915 break;
7560fffc 1916
0284adc6 1917 case OBJECT_TAG:
f7fab8a5
LP
1918 printf("Type: OBJECT_TAG seqnum=%llu epoch=%llu\n",
1919 (unsigned long long) le64toh(o->tag.seqnum),
1920 (unsigned long long) le64toh(o->tag.epoch));
0284adc6
LP
1921 break;
1922 }
7560fffc 1923
0284adc6
LP
1924 if (o->object.flags & OBJECT_COMPRESSED)
1925 printf("Flags: COMPRESSED\n");
7560fffc 1926
0284adc6
LP
1927 if (p == le64toh(f->header->tail_object_offset))
1928 p = 0;
1929 else
1930 p = p + ALIGN64(le64toh(o->object.size));
1931 }
7560fffc 1932
0284adc6
LP
1933 return;
1934fail:
1935 log_error("File corrupt");
7560fffc
LP
1936}
1937
0284adc6
LP
1938void journal_file_print_header(JournalFile *f) {
1939 char a[33], b[33], c[33];
1940 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
1941 struct stat st;
1942 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
1943
1944 assert(f);
7560fffc 1945
0284adc6
LP
1946 printf("File Path: %s\n"
1947 "File ID: %s\n"
1948 "Machine ID: %s\n"
1949 "Boot ID: %s\n"
1950 "Sequential Number ID: %s\n"
1951 "State: %s\n"
1952 "Compatible Flags:%s%s\n"
1953 "Incompatible Flags:%s%s\n"
1954 "Header size: %llu\n"
1955 "Arena size: %llu\n"
1956 "Data Hash Table Size: %llu\n"
1957 "Field Hash Table Size: %llu\n"
0284adc6
LP
1958 "Rotate Suggested: %s\n"
1959 "Head Sequential Number: %llu\n"
1960 "Tail Sequential Number: %llu\n"
1961 "Head Realtime Timestamp: %s\n"
3223f44f
LP
1962 "Tail Realtime Timestamp: %s\n"
1963 "Objects: %llu\n"
1964 "Entry Objects: %llu\n",
0284adc6
LP
1965 f->path,
1966 sd_id128_to_string(f->header->file_id, a),
1967 sd_id128_to_string(f->header->machine_id, b),
1968 sd_id128_to_string(f->header->boot_id, c),
1969 sd_id128_to_string(f->header->seqnum_id, c),
3223f44f
LP
1970 f->header->state == STATE_OFFLINE ? "OFFLINE" :
1971 f->header->state == STATE_ONLINE ? "ONLINE" :
1972 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3
LP
1973 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
1974 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_SEALED) ? " ???" : "",
1975 JOURNAL_HEADER_COMPRESSED(f->header) ? " COMPRESSED" : "",
1976 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
0284adc6
LP
1977 (unsigned long long) le64toh(f->header->header_size),
1978 (unsigned long long) le64toh(f->header->arena_size),
1979 (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
1980 (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
0284adc6
LP
1981 yes_no(journal_file_rotate_suggested(f)),
1982 (unsigned long long) le64toh(f->header->head_entry_seqnum),
1983 (unsigned long long) le64toh(f->header->tail_entry_seqnum),
1984 format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
3223f44f
LP
1985 format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
1986 (unsigned long long) le64toh(f->header->n_objects),
1987 (unsigned long long) le64toh(f->header->n_entries));
7560fffc 1988
0284adc6
LP
1989 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1990 printf("Data Objects: %llu\n"
1991 "Data Hash Table Fill: %.1f%%\n",
1992 (unsigned long long) le64toh(f->header->n_data),
1993 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 1994
0284adc6
LP
1995 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1996 printf("Field Objects: %llu\n"
1997 "Field Hash Table Fill: %.1f%%\n",
1998 (unsigned long long) le64toh(f->header->n_fields),
1999 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2000
2001 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2002 printf("Tag Objects: %llu\n",
2003 (unsigned long long) le64toh(f->header->n_tags));
2004 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2005 printf("Entry Array Objects: %llu\n",
2006 (unsigned long long) le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2007
2008 if (fstat(f->fd, &st) >= 0)
2009 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
7560fffc
LP
2010}
2011
0284adc6
LP
2012int journal_file_open(
2013 const char *fname,
2014 int flags,
2015 mode_t mode,
2016 bool compress,
baed47c3 2017 bool seal,
0284adc6
LP
2018 JournalMetrics *metrics,
2019 MMapCache *mmap_cache,
2020 JournalFile *template,
2021 JournalFile **ret) {
7560fffc 2022
0284adc6
LP
2023 JournalFile *f;
2024 int r;
2025 bool newly_created = false;
7560fffc 2026
0284adc6 2027 assert(fname);
0559d3a5 2028 assert(ret);
7560fffc 2029
0284adc6
LP
2030 if ((flags & O_ACCMODE) != O_RDONLY &&
2031 (flags & O_ACCMODE) != O_RDWR)
2032 return -EINVAL;
7560fffc 2033
a0108012
LP
2034 if (!endswith(fname, ".journal") &&
2035 !endswith(fname, ".journal~"))
0284adc6 2036 return -EINVAL;
7560fffc 2037
0284adc6
LP
2038 f = new0(JournalFile, 1);
2039 if (!f)
2040 return -ENOMEM;
7560fffc 2041
0284adc6
LP
2042 f->fd = -1;
2043 f->mode = mode;
7560fffc 2044
0284adc6
LP
2045 f->flags = flags;
2046 f->prot = prot_from_flags(flags);
2047 f->writable = (flags & O_ACCMODE) != O_RDONLY;
48b61739 2048#ifdef HAVE_XZ
0284adc6 2049 f->compress = compress;
48b61739 2050#endif
49a32d43 2051#ifdef HAVE_GCRYPT
baed47c3 2052 f->seal = seal;
49a32d43 2053#endif
7560fffc 2054
0284adc6
LP
2055 if (mmap_cache)
2056 f->mmap = mmap_cache_ref(mmap_cache);
2057 else {
84168d80 2058 f->mmap = mmap_cache_new();
0284adc6
LP
2059 if (!f->mmap) {
2060 r = -ENOMEM;
2061 goto fail;
2062 }
2063 }
7560fffc 2064
0284adc6
LP
2065 f->path = strdup(fname);
2066 if (!f->path) {
2067 r = -ENOMEM;
2068 goto fail;
2069 }
7560fffc 2070
0284adc6
LP
2071 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2072 if (f->fd < 0) {
2073 r = -errno;
2074 goto fail;
7560fffc 2075 }
7560fffc 2076
0284adc6
LP
2077 if (fstat(f->fd, &f->last_stat) < 0) {
2078 r = -errno;
2079 goto fail;
2080 }
7560fffc 2081
0284adc6
LP
2082 if (f->last_stat.st_size == 0 && f->writable) {
2083 newly_created = true;
7560fffc 2084
feb12d3e 2085#ifdef HAVE_GCRYPT
0284adc6 2086 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2087 * just don't do sealing */
49a32d43
LP
2088 if (f->seal) {
2089 r = journal_file_fss_load(f);
2090 if (r < 0)
2091 f->seal = false;
2092 }
feb12d3e 2093#endif
7560fffc 2094
0284adc6
LP
2095 r = journal_file_init_header(f, template);
2096 if (r < 0)
2097 goto fail;
7560fffc 2098
0284adc6
LP
2099 if (fstat(f->fd, &f->last_stat) < 0) {
2100 r = -errno;
2101 goto fail;
2102 }
2103 }
7560fffc 2104
0284adc6
LP
2105 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
2106 r = -EIO;
2107 goto fail;
2108 }
7560fffc 2109
0284adc6
LP
2110 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2111 if (f->header == MAP_FAILED) {
2112 f->header = NULL;
2113 r = -errno;
2114 goto fail;
2115 }
7560fffc 2116
0284adc6
LP
2117 if (!newly_created) {
2118 r = journal_file_verify_header(f);
2119 if (r < 0)
2120 goto fail;
2121 }
7560fffc 2122
feb12d3e 2123#ifdef HAVE_GCRYPT
0284adc6 2124 if (!newly_created && f->writable) {
baed47c3 2125 r = journal_file_fss_load(f);
0284adc6
LP
2126 if (r < 0)
2127 goto fail;
2128 }
feb12d3e 2129#endif
cec736d2
LP
2130
2131 if (f->writable) {
4a92baf3
LP
2132 if (metrics) {
2133 journal_default_metrics(metrics, f->fd);
2134 f->metrics = *metrics;
2135 } else if (template)
2136 f->metrics = template->metrics;
2137
cec736d2
LP
2138 r = journal_file_refresh_header(f);
2139 if (r < 0)
2140 goto fail;
2141 }
2142
feb12d3e 2143#ifdef HAVE_GCRYPT
baed47c3 2144 r = journal_file_hmac_setup(f);
14d10188
LP
2145 if (r < 0)
2146 goto fail;
feb12d3e 2147#endif
14d10188 2148
cec736d2 2149 if (newly_created) {
de190aef 2150 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2151 if (r < 0)
2152 goto fail;
2153
de190aef 2154 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2155 if (r < 0)
2156 goto fail;
7560fffc 2157
feb12d3e 2158#ifdef HAVE_GCRYPT
7560fffc
LP
2159 r = journal_file_append_first_tag(f);
2160 if (r < 0)
2161 goto fail;
feb12d3e 2162#endif
cec736d2
LP
2163 }
2164
de190aef 2165 r = journal_file_map_field_hash_table(f);
cec736d2
LP
2166 if (r < 0)
2167 goto fail;
2168
de190aef 2169 r = journal_file_map_data_hash_table(f);
cec736d2
LP
2170 if (r < 0)
2171 goto fail;
2172
0559d3a5 2173 *ret = f;
cec736d2
LP
2174 return 0;
2175
2176fail:
2177 journal_file_close(f);
2178
2179 return r;
2180}
0ac38b70 2181
baed47c3 2182int journal_file_rotate(JournalFile **f, bool compress, bool seal) {
0ac38b70
LP
2183 char *p;
2184 size_t l;
2185 JournalFile *old_file, *new_file = NULL;
2186 int r;
2187
2188 assert(f);
2189 assert(*f);
2190
2191 old_file = *f;
2192
2193 if (!old_file->writable)
2194 return -EINVAL;
2195
2196 if (!endswith(old_file->path, ".journal"))
2197 return -EINVAL;
2198
2199 l = strlen(old_file->path);
2200
9447a7f1 2201 p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
0ac38b70
LP
2202 if (!p)
2203 return -ENOMEM;
2204
2205 memcpy(p, old_file->path, l - 8);
2206 p[l-8] = '@';
2207 sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2208 snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2209 "-%016llx-%016llx.journal",
beec0085 2210 (unsigned long long) le64toh((*f)->header->tail_entry_seqnum),
0ac38b70
LP
2211 (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
2212
2213 r = rename(old_file->path, p);
2214 free(p);
2215
2216 if (r < 0)
2217 return -errno;
2218
ccdbaf91 2219 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2220
baed47c3 2221 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, old_file, &new_file);
0ac38b70
LP
2222 journal_file_close(old_file);
2223
2224 *f = new_file;
2225 return r;
2226}
2227
9447a7f1
LP
2228int journal_file_open_reliably(
2229 const char *fname,
2230 int flags,
2231 mode_t mode,
7560fffc 2232 bool compress,
baed47c3 2233 bool seal,
4a92baf3 2234 JournalMetrics *metrics,
27370278 2235 MMapCache *mmap_cache,
9447a7f1
LP
2236 JournalFile *template,
2237 JournalFile **ret) {
2238
2239 int r;
2240 size_t l;
2241 char *p;
2242
baed47c3 2243 r = journal_file_open(fname, flags, mode, compress, seal,
27370278 2244 metrics, mmap_cache, template, ret);
0071d9f1
LP
2245 if (r != -EBADMSG && /* corrupted */
2246 r != -ENODATA && /* truncated */
2247 r != -EHOSTDOWN && /* other machine */
a1a1898f
LP
2248 r != -EPROTONOSUPPORT && /* incompatible feature */
2249 r != -EBUSY && /* unclean shutdown */
2250 r != -ESHUTDOWN /* already archived */)
9447a7f1
LP
2251 return r;
2252
2253 if ((flags & O_ACCMODE) == O_RDONLY)
2254 return r;
2255
2256 if (!(flags & O_CREAT))
2257 return r;
2258
7560fffc
LP
2259 if (!endswith(fname, ".journal"))
2260 return r;
2261
5c70eab4
LP
2262 /* The file is corrupted. Rotate it away and try it again (but only once) */
2263
9447a7f1
LP
2264 l = strlen(fname);
2265 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2266 (int) (l-8), fname,
2267 (unsigned long long) now(CLOCK_REALTIME),
2268 random_ull()) < 0)
2269 return -ENOMEM;
2270
2271 r = rename(fname, p);
2272 free(p);
2273 if (r < 0)
2274 return -errno;
2275
a1a1898f 2276 log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 2277
baed47c3 2278 return journal_file_open(fname, flags, mode, compress, seal,
27370278 2279 metrics, mmap_cache, template, ret);
9447a7f1
LP
2280}
2281
cf244689
LP
2282
2283int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2284 uint64_t i, n;
2285 uint64_t q, xor_hash = 0;
2286 int r;
2287 EntryItem *items;
2288 dual_timestamp ts;
2289
2290 assert(from);
2291 assert(to);
2292 assert(o);
2293 assert(p);
2294
2295 if (!to->writable)
2296 return -EPERM;
2297
2298 ts.monotonic = le64toh(o->entry.monotonic);
2299 ts.realtime = le64toh(o->entry.realtime);
2300
2301 if (to->tail_entry_monotonic_valid &&
2302 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2303 return -EINVAL;
2304
cf244689
LP
2305 n = journal_file_entry_n_items(o);
2306 items = alloca(sizeof(EntryItem) * n);
2307
2308 for (i = 0; i < n; i++) {
4fd052ae
FC
2309 uint64_t l, h;
2310 le64_t le_hash;
cf244689
LP
2311 size_t t;
2312 void *data;
2313 Object *u;
2314
2315 q = le64toh(o->entry.items[i].object_offset);
2316 le_hash = o->entry.items[i].hash;
2317
2318 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2319 if (r < 0)
2320 return r;
2321
2322 if (le_hash != o->data.hash)
2323 return -EBADMSG;
2324
2325 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2326 t = (size_t) l;
2327
2328 /* We hit the limit on 32bit machines */
2329 if ((uint64_t) t != l)
2330 return -E2BIG;
2331
2332 if (o->object.flags & OBJECT_COMPRESSED) {
2333#ifdef HAVE_XZ
2334 uint64_t rsize;
2335
2336 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2337 return -EBADMSG;
2338
2339 data = from->compress_buffer;
2340 l = rsize;
2341#else
2342 return -EPROTONOSUPPORT;
2343#endif
2344 } else
2345 data = o->data.payload;
2346
2347 r = journal_file_append_data(to, data, l, &u, &h);
2348 if (r < 0)
2349 return r;
2350
2351 xor_hash ^= le64toh(u->data.hash);
2352 items[i].object_offset = htole64(h);
2353 items[i].hash = u->data.hash;
2354
2355 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2356 if (r < 0)
2357 return r;
2358 }
2359
2360 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2361}
babfc091
LP
2362
2363void journal_default_metrics(JournalMetrics *m, int fd) {
2364 uint64_t fs_size = 0;
2365 struct statvfs ss;
a7bc2c2a 2366 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
babfc091
LP
2367
2368 assert(m);
2369 assert(fd >= 0);
2370
2371 if (fstatvfs(fd, &ss) >= 0)
2372 fs_size = ss.f_frsize * ss.f_blocks;
2373
2374 if (m->max_use == (uint64_t) -1) {
2375
2376 if (fs_size > 0) {
2377 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2378
2379 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2380 m->max_use = DEFAULT_MAX_USE_UPPER;
2381
2382 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2383 m->max_use = DEFAULT_MAX_USE_LOWER;
2384 } else
2385 m->max_use = DEFAULT_MAX_USE_LOWER;
2386 } else {
2387 m->max_use = PAGE_ALIGN(m->max_use);
2388
2389 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2390 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2391 }
2392
2393 if (m->max_size == (uint64_t) -1) {
2394 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2395
2396 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2397 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2398 } else
2399 m->max_size = PAGE_ALIGN(m->max_size);
2400
2401 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2402 m->max_size = JOURNAL_FILE_SIZE_MIN;
2403
2404 if (m->max_size*2 > m->max_use)
2405 m->max_use = m->max_size*2;
2406
2407 if (m->min_size == (uint64_t) -1)
2408 m->min_size = JOURNAL_FILE_SIZE_MIN;
2409 else {
2410 m->min_size = PAGE_ALIGN(m->min_size);
2411
2412 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2413 m->min_size = JOURNAL_FILE_SIZE_MIN;
2414
2415 if (m->min_size > m->max_size)
2416 m->max_size = m->min_size;
2417 }
2418
2419 if (m->keep_free == (uint64_t) -1) {
2420
2421 if (fs_size > 0) {
2422 m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2423
2424 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2425 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2426
2427 } else
2428 m->keep_free = DEFAULT_KEEP_FREE;
2429 }
2430
2b43f939
LP
2431 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2432 format_bytes(a, sizeof(a), m->max_use),
2433 format_bytes(b, sizeof(b), m->max_size),
2434 format_bytes(c, sizeof(c), m->min_size),
2435 format_bytes(d, sizeof(d), m->keep_free));
babfc091 2436}
08984293
LP
2437
2438int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
2439 assert(f);
2440 assert(from || to);
2441
2442 if (from) {
162566a4
LP
2443 if (f->header->head_entry_realtime == 0)
2444 return -ENOENT;
08984293 2445
162566a4 2446 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
2447 }
2448
2449 if (to) {
162566a4
LP
2450 if (f->header->tail_entry_realtime == 0)
2451 return -ENOENT;
08984293 2452
162566a4 2453 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
2454 }
2455
2456 return 1;
2457}
2458
2459int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2460 char t[9+32+1] = "_BOOT_ID=";
2461 Object *o;
2462 uint64_t p;
2463 int r;
2464
2465 assert(f);
2466 assert(from || to);
2467
2468 sd_id128_to_string(boot_id, t + 9);
2469
2470 r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2471 if (r <= 0)
2472 return r;
2473
2474 if (le64toh(o->data.n_entries) <= 0)
2475 return 0;
2476
2477 if (from) {
2478 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2479 if (r < 0)
2480 return r;
2481
2482 *from = le64toh(o->entry.monotonic);
2483 }
2484
2485 if (to) {
2486 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2487 if (r < 0)
2488 return r;
2489
2490 r = generic_array_get_plus_one(f,
2491 le64toh(o->data.entry_offset),
2492 le64toh(o->data.entry_array_offset),
2493 le64toh(o->data.n_entries)-1,
2494 &o, NULL);
2495 if (r <= 0)
2496 return r;
2497
2498 *to = le64toh(o->entry.monotonic);
2499 }
2500
2501 return 1;
2502}
dca6219e
LP
2503
2504bool journal_file_rotate_suggested(JournalFile *f) {
2505 assert(f);
2506
2507 /* If we gained new header fields we gained new features,
2508 * hence suggest a rotation */
361f9cbc
LP
2509 if (le64toh(f->header->header_size) < sizeof(Header)) {
2510 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 2511 return true;
361f9cbc 2512 }
dca6219e
LP
2513
2514 /* Let's check if the hash tables grew over a certain fill
2515 * level (75%, borrowing this value from Java's hash table
2516 * implementation), and if so suggest a rotation. To calculate
2517 * the fill level we need the n_data field, which only exists
2518 * in newer versions. */
2519
2520 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc
LP
2521 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2522 log_debug("Data hash table of %s has a fill level at %.1f (%llu of %llu items, %llu file size, %llu bytes per hash table item), suggesting rotation.",
2523 f->path,
2524 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
2525 (unsigned long long) le64toh(f->header->n_data),
2526 (unsigned long long) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)),
2527 (unsigned long long) (f->last_stat.st_size),
2528 (unsigned long long) (f->last_stat.st_size / le64toh(f->header->n_data)));
dca6219e 2529 return true;
361f9cbc 2530 }
dca6219e
LP
2531
2532 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc
LP
2533 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
2534 log_debug("Field hash table of %s has a fill level at %.1f (%llu of %llu items), suggesting rotation.",
2535 f->path,
2536 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
2537 (unsigned long long) le64toh(f->header->n_fields),
2538 (unsigned long long) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)));
dca6219e 2539 return true;
361f9cbc 2540 }
dca6219e
LP
2541
2542 return false;
2543}