]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
journal: estimate data hash table size a bit larger by default
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
cec736d2 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <sys/mman.h>
23#include <errno.h>
24#include <sys/uio.h>
25#include <unistd.h>
26#include <sys/statvfs.h>
27#include <fcntl.h>
28#include <stddef.h>
29
30#include "journal-def.h"
31#include "journal-file.h"
32#include "lookup3.h"
807e17f0 33#include "compress.h"
cec736d2 34
4a92baf3
LP
/* Round x up to the next multiple of 8 bytes (i.e. 64 bit) */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)

/* Minimum sizes (in bytes) of the two hash tables of a journal file */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL * sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL * sizeof(HashItem))

/* Mappings smaller than this are widened to this size when a window
 * has to be (re)mapped */
#define DEFAULT_WINDOW_SIZE (8ULL * 1024ULL * 1024ULL)

/* Data payloads smaller than this are never passed to the compressor */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (64ULL * 1024ULL) /* 64 KiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL * 1024ULL * 1024ULL) /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL * 1024ULL * 1024ULL * 1024ULL) /* 4 GiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL * 1024ULL * 1024ULL) /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL * 1024ULL * 1024ULL * 1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL * 1024ULL) /* 1 MiB */

/* n_data was the first entry we added after the initial file format
 * design, hence everything in front of it must always be present */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* True if the header stored in the file is large enough to include
 * the specified field in its entirety */
#define JOURNAL_HEADER_CONTAINS(h, field) \
        (le64toh((h)->header_size) >= offsetof(Header, field) + sizeof((h)->field))

/* The magic bytes every journal file starts with */
static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };
70
cec736d2 71void journal_file_close(JournalFile *f) {
de190aef 72 int t;
cec736d2 73
de190aef 74 assert(f);
cec736d2 75
d384c7a8 76 if (f->header) {
cd96b3b8
LP
77 /* Mark the file offline. Don't override the archived state if it already is set */
78 if (f->writable && f->header->state == STATE_ONLINE)
d384c7a8 79 f->header->state = STATE_OFFLINE;
cec736d2 80
d384c7a8
MS
81 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
82 }
cec736d2 83
de190aef
LP
84 for (t = 0; t < _WINDOW_MAX; t++)
85 if (f->windows[t].ptr)
86 munmap(f->windows[t].ptr, f->windows[t].size);
cec736d2 87
0ac38b70
LP
88 if (f->fd >= 0)
89 close_nointr_nofail(f->fd);
90
cec736d2 91 free(f->path);
807e17f0
LP
92
93#ifdef HAVE_XZ
94 free(f->compress_buffer);
95#endif
96
cec736d2
LP
97 free(f);
98}
99
0ac38b70 100static int journal_file_init_header(JournalFile *f, JournalFile *template) {
cec736d2
LP
101 Header h;
102 ssize_t k;
103 int r;
104
105 assert(f);
106
107 zero(h);
108 memcpy(h.signature, signature, 8);
23b0b2b2 109 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2
LP
110
111 r = sd_id128_randomize(&h.file_id);
112 if (r < 0)
113 return r;
114
0ac38b70
LP
115 if (template) {
116 h.seqnum_id = template->header->seqnum_id;
dca6219e 117 h.tail_seqnum = template->header->tail_seqnum;
0ac38b70
LP
118 } else
119 h.seqnum_id = h.file_id;
cec736d2
LP
120
121 k = pwrite(f->fd, &h, sizeof(h), 0);
122 if (k < 0)
123 return -errno;
124
125 if (k != sizeof(h))
126 return -EIO;
127
128 return 0;
129}
130
131static int journal_file_refresh_header(JournalFile *f) {
132 int r;
de190aef 133 sd_id128_t boot_id;
cec736d2
LP
134
135 assert(f);
136
137 r = sd_id128_get_machine(&f->header->machine_id);
138 if (r < 0)
139 return r;
140
de190aef 141 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
142 if (r < 0)
143 return r;
144
de190aef
LP
145 if (sd_id128_equal(boot_id, f->header->boot_id))
146 f->tail_entry_monotonic_valid = true;
147
148 f->header->boot_id = boot_id;
149
150 f->header->state = STATE_ONLINE;
b788cc23
LP
151
152 __sync_synchronize();
153
cec736d2
LP
154 return 0;
155}
156
157static int journal_file_verify_header(JournalFile *f) {
158 assert(f);
159
160 if (memcmp(f->header, signature, 8))
161 return -EBADMSG;
162
807e17f0
LP
163#ifdef HAVE_XZ
164 if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
165 return -EPROTONOSUPPORT;
166#else
cec736d2
LP
167 if (f->header->incompatible_flags != 0)
168 return -EPROTONOSUPPORT;
807e17f0 169#endif
cec736d2 170
dca6219e
LP
171 /* The first addition was n_data, so check that we are at least this large */
172 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
173 return -EBADMSG;
174
175 if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
cec736d2
LP
176 return -ENODATA;
177
178 if (f->writable) {
ccdbaf91 179 uint8_t state;
cec736d2
LP
180 sd_id128_t machine_id;
181 int r;
182
183 r = sd_id128_get_machine(&machine_id);
184 if (r < 0)
185 return r;
186
187 if (!sd_id128_equal(machine_id, f->header->machine_id))
188 return -EHOSTDOWN;
189
de190aef 190 state = f->header->state;
cec736d2 191
71fa6f00
LP
192 if (state == STATE_ONLINE) {
193 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
194 return -EBUSY;
195 } else if (state == STATE_ARCHIVED)
cec736d2 196 return -ESHUTDOWN;
71fa6f00
LP
197 else if (state != STATE_OFFLINE) {
198 log_debug("Journal file %s has unknown state %u.", f->path, state);
199 return -EBUSY;
200 }
cec736d2
LP
201 }
202
203 return 0;
204}
205
206static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
cec736d2 207 uint64_t old_size, new_size;
fec2aa2f 208 int r;
cec736d2
LP
209
210 assert(f);
211
cec736d2 212 /* We assume that this file is not sparse, and we know that
38ac38b2 213 * for sure, since we always call posix_fallocate()
cec736d2
LP
214 * ourselves */
215
216 old_size =
23b0b2b2 217 le64toh(f->header->header_size) +
cec736d2
LP
218 le64toh(f->header->arena_size);
219
bc85bfee 220 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
221 if (new_size < le64toh(f->header->header_size))
222 new_size = le64toh(f->header->header_size);
bc85bfee
LP
223
224 if (new_size <= old_size)
cec736d2
LP
225 return 0;
226
bc85bfee
LP
227 if (f->metrics.max_size > 0 &&
228 new_size > f->metrics.max_size)
229 return -E2BIG;
cec736d2 230
bc85bfee
LP
231 if (new_size > f->metrics.min_size &&
232 f->metrics.keep_free > 0) {
cec736d2
LP
233 struct statvfs svfs;
234
235 if (fstatvfs(f->fd, &svfs) >= 0) {
236 uint64_t available;
237
238 available = svfs.f_bfree * svfs.f_bsize;
239
bc85bfee
LP
240 if (available >= f->metrics.keep_free)
241 available -= f->metrics.keep_free;
cec736d2
LP
242 else
243 available = 0;
244
245 if (new_size - old_size > available)
246 return -E2BIG;
247 }
248 }
249
bc85bfee
LP
250 /* Note that the glibc fallocate() fallback is very
251 inefficient, hence we try to minimize the allocation area
252 as we can. */
fec2aa2f
GV
253 r = posix_fallocate(f->fd, old_size, new_size - old_size);
254 if (r != 0)
255 return -r;
cec736d2
LP
256
257 if (fstat(f->fd, &f->last_stat) < 0)
258 return -errno;
259
23b0b2b2 260 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2
LP
261
262 return 0;
263}
264
265static int journal_file_map(
266 JournalFile *f,
267 uint64_t offset,
268 uint64_t size,
269 void **_window,
270 uint64_t *_woffset,
271 uint64_t *_wsize,
272 void **ret) {
273
274 uint64_t woffset, wsize;
275 void *window;
276
277 assert(f);
278 assert(size > 0);
279 assert(ret);
280
281 woffset = offset & ~((uint64_t) page_size() - 1ULL);
282 wsize = size + (offset - woffset);
283 wsize = PAGE_ALIGN(wsize);
284
2a59ea54
LP
285 /* Avoid SIGBUS on invalid accesses */
286 if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
287 return -EADDRNOTAVAIL;
288
cec736d2
LP
289 window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
290 if (window == MAP_FAILED)
291 return -errno;
292
293 if (_window)
294 *_window = window;
295
296 if (_woffset)
297 *_woffset = woffset;
298
299 if (_wsize)
300 *_wsize = wsize;
301
302 *ret = (uint8_t*) window + (offset - woffset);
303
304 return 0;
305}
306
de190aef 307static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
6c8a39b8 308 void *p = NULL;
cec736d2
LP
309 uint64_t delta;
310 int r;
de190aef 311 Window *w;
cec736d2
LP
312
313 assert(f);
314 assert(ret);
de190aef
LP
315 assert(wt >= 0);
316 assert(wt < _WINDOW_MAX);
cec736d2 317
4bbdcdb3
LP
318 if (offset + size > (uint64_t) f->last_stat.st_size) {
319 /* Hmm, out of range? Let's refresh the fstat() data
320 * first, before we trust that check. */
321
322 if (fstat(f->fd, &f->last_stat) < 0 ||
323 offset + size > (uint64_t) f->last_stat.st_size)
324 return -EADDRNOTAVAIL;
325 }
326
de190aef 327 w = f->windows + wt;
cec736d2 328
de190aef
LP
329 if (_likely_(w->ptr &&
330 w->offset <= offset &&
331 w->offset + w->size >= offset + size)) {
332
333 *ret = (uint8_t*) w->ptr + (offset - w->offset);
cec736d2
LP
334 return 0;
335 }
336
de190aef
LP
337 if (w->ptr) {
338 if (munmap(w->ptr, w->size) < 0)
cec736d2
LP
339 return -errno;
340
de190aef
LP
341 w->ptr = NULL;
342 w->size = w->offset = 0;
cec736d2
LP
343 }
344
345 if (size < DEFAULT_WINDOW_SIZE) {
346 /* If the default window size is larger then what was
347 * asked for extend the mapping a bit in the hope to
348 * minimize needed remappings later on. We add half
349 * the window space before and half behind the
350 * requested mapping */
351
1921a5cb 352 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
cec736d2 353
a99c349d 354 if (delta > offset)
cec736d2
LP
355 delta = offset;
356
357 offset -= delta;
a99c349d 358 size = DEFAULT_WINDOW_SIZE;
cec736d2
LP
359 } else
360 delta = 0;
361
2a59ea54 362 if (offset + size > (uint64_t) f->last_stat.st_size)
1921a5cb 363 size = (uint64_t) f->last_stat.st_size - offset;
2a59ea54
LP
364
365 if (size <= 0)
366 return -EADDRNOTAVAIL;
367
cec736d2
LP
368 r = journal_file_map(f,
369 offset, size,
de190aef
LP
370 &w->ptr, &w->offset, &w->size,
371 &p);
cec736d2
LP
372
373 if (r < 0)
374 return r;
375
376 *ret = (uint8_t*) p + delta;
377 return 0;
378}
379
380static bool verify_hash(Object *o) {
de190aef 381 uint64_t h1, h2;
cec736d2
LP
382
383 assert(o);
384
807e17f0 385 if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
cec736d2 386 h1 = le64toh(o->data.hash);
de190aef
LP
387 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
388 } else if (o->object.type == OBJECT_FIELD) {
389 h1 = le64toh(o->field.hash);
390 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
391 } else
392 return true;
cec736d2 393
de190aef 394 return h1 == h2;
cec736d2
LP
395}
396
de190aef 397int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
cec736d2
LP
398 int r;
399 void *t;
400 Object *o;
401 uint64_t s;
402
403 assert(f);
404 assert(ret);
de190aef 405 assert(type < _OBJECT_TYPE_MAX);
cec736d2 406
de190aef 407 r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
408 if (r < 0)
409 return r;
410
411 o = (Object*) t;
412 s = le64toh(o->object.size);
413
414 if (s < sizeof(ObjectHeader))
415 return -EBADMSG;
416
de190aef 417 if (type >= 0 && o->object.type != type)
cec736d2
LP
418 return -EBADMSG;
419
420 if (s > sizeof(ObjectHeader)) {
de190aef 421 r = journal_file_move_to(f, o->object.type, offset, s, &t);
cec736d2
LP
422 if (r < 0)
423 return r;
424
425 o = (Object*) t;
426 }
427
428 if (!verify_hash(o))
429 return -EBADMSG;
430
431 *ret = o;
432 return 0;
433}
434
c2373f84 435static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
436 uint64_t r;
437
438 assert(f);
439
dca6219e 440 r = le64toh(f->header->tail_seqnum) + 1;
c2373f84
LP
441
442 if (seqnum) {
de190aef 443 /* If an external seqnum counter was passed, we update
c2373f84
LP
444 * both the local and the external one, and set it to
445 * the maximum of both */
446
447 if (*seqnum + 1 > r)
448 r = *seqnum + 1;
449
450 *seqnum = r;
451 }
452
dca6219e 453 f->header->tail_seqnum = htole64(r);
cec736d2 454
dca6219e
LP
455 if (f->header->head_seqnum == 0)
456 f->header->head_seqnum = htole64(r);
de190aef 457
cec736d2
LP
458 return r;
459}
460
de190aef 461static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
462 int r;
463 uint64_t p;
464 Object *tail, *o;
465 void *t;
466
467 assert(f);
468 assert(size >= sizeof(ObjectHeader));
469 assert(offset);
470 assert(ret);
471
472 p = le64toh(f->header->tail_object_offset);
cec736d2 473 if (p == 0)
23b0b2b2 474 p = le64toh(f->header->header_size);
cec736d2 475 else {
de190aef 476 r = journal_file_move_to_object(f, -1, p, &tail);
cec736d2
LP
477 if (r < 0)
478 return r;
479
480 p += ALIGN64(le64toh(tail->object.size));
481 }
482
483 r = journal_file_allocate(f, p, size);
484 if (r < 0)
485 return r;
486
de190aef 487 r = journal_file_move_to(f, type, p, size, &t);
cec736d2
LP
488 if (r < 0)
489 return r;
490
491 o = (Object*) t;
492
493 zero(o->object);
de190aef 494 o->object.type = type;
cec736d2
LP
495 o->object.size = htole64(size);
496
497 f->header->tail_object_offset = htole64(p);
cec736d2
LP
498 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
499
500 *ret = o;
501 *offset = p;
502
503 return 0;
504}
505
de190aef 506static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
507 uint64_t s, p;
508 Object *o;
509 int r;
510
511 assert(f);
512
dfabe643 513 /* We estimate that we need 1 hash table entry per 768 of
4a92baf3
LP
514 journal file and we want to make sure we never get beyond
515 75% fill level. Calculate the hash table size for the
516 maximum file size based on these metrics. */
517
dfabe643 518 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
519 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
520 s = DEFAULT_DATA_HASH_TABLE_SIZE;
521
dfabe643 522 log_info("Reserving %llu entries in hash table.", (unsigned long long) (s / sizeof(HashItem)));
4a92baf3 523
de190aef
LP
524 r = journal_file_append_object(f,
525 OBJECT_DATA_HASH_TABLE,
526 offsetof(Object, hash_table.items) + s,
527 &o, &p);
cec736d2
LP
528 if (r < 0)
529 return r;
530
de190aef 531 memset(o->hash_table.items, 0, s);
cec736d2 532
de190aef
LP
533 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
534 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
535
536 return 0;
537}
538
de190aef 539static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
540 uint64_t s, p;
541 Object *o;
542 int r;
543
544 assert(f);
545
de190aef
LP
546 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
547 r = journal_file_append_object(f,
548 OBJECT_FIELD_HASH_TABLE,
549 offsetof(Object, hash_table.items) + s,
550 &o, &p);
cec736d2
LP
551 if (r < 0)
552 return r;
553
de190aef 554 memset(o->hash_table.items, 0, s);
cec736d2 555
de190aef
LP
556 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
557 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
558
559 return 0;
560}
561
de190aef 562static int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
563 uint64_t s, p;
564 void *t;
565 int r;
566
567 assert(f);
568
de190aef
LP
569 p = le64toh(f->header->data_hash_table_offset);
570 s = le64toh(f->header->data_hash_table_size);
cec736d2 571
de190aef
LP
572 r = journal_file_move_to(f,
573 WINDOW_DATA_HASH_TABLE,
574 p, s,
575 &t);
cec736d2
LP
576 if (r < 0)
577 return r;
578
de190aef 579 f->data_hash_table = t;
cec736d2
LP
580 return 0;
581}
582
de190aef 583static int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
584 uint64_t s, p;
585 void *t;
586 int r;
587
588 assert(f);
589
de190aef
LP
590 p = le64toh(f->header->field_hash_table_offset);
591 s = le64toh(f->header->field_hash_table_size);
cec736d2 592
de190aef
LP
593 r = journal_file_move_to(f,
594 WINDOW_FIELD_HASH_TABLE,
595 p, s,
596 &t);
cec736d2
LP
597 if (r < 0)
598 return r;
599
de190aef 600 f->field_hash_table = t;
cec736d2
LP
601 return 0;
602}
603
de190aef
LP
604static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
605 uint64_t p, h;
cec736d2
LP
606 int r;
607
608 assert(f);
609 assert(o);
610 assert(offset > 0);
de190aef 611 assert(o->object.type == OBJECT_DATA);
cec736d2 612
48496df6
LP
613 /* This might alter the window we are looking at */
614
de190aef
LP
615 o->data.next_hash_offset = o->data.next_field_offset = 0;
616 o->data.entry_offset = o->data.entry_array_offset = 0;
617 o->data.n_entries = 0;
cec736d2 618
de190aef 619 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
8db4213e 620 p = le64toh(f->data_hash_table[h].tail_hash_offset);
cec736d2
LP
621 if (p == 0) {
622 /* Only entry in the hash table is easy */
de190aef 623 f->data_hash_table[h].head_hash_offset = htole64(offset);
cec736d2 624 } else {
48496df6
LP
625 /* Move back to the previous data object, to patch in
626 * pointer */
cec736d2 627
de190aef 628 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
629 if (r < 0)
630 return r;
631
de190aef 632 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
633 }
634
de190aef 635 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 636
dca6219e
LP
637 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
638 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
639
cec736d2
LP
640 return 0;
641}
642
de190aef
LP
643int journal_file_find_data_object_with_hash(
644 JournalFile *f,
645 const void *data, uint64_t size, uint64_t hash,
646 Object **ret, uint64_t *offset) {
48496df6 647
de190aef 648 uint64_t p, osize, h;
cec736d2
LP
649 int r;
650
651 assert(f);
652 assert(data || size == 0);
653
654 osize = offsetof(Object, data.payload) + size;
655
bc85bfee
LP
656 if (f->header->data_hash_table_size == 0)
657 return -EBADMSG;
658
de190aef
LP
659 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
660 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 661
de190aef
LP
662 while (p > 0) {
663 Object *o;
cec736d2 664
de190aef 665 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
666 if (r < 0)
667 return r;
668
807e17f0 669 if (le64toh(o->data.hash) != hash)
85a131e8 670 goto next;
807e17f0
LP
671
672 if (o->object.flags & OBJECT_COMPRESSED) {
673#ifdef HAVE_XZ
b785c858 674 uint64_t l, rsize;
cec736d2 675
807e17f0
LP
676 l = le64toh(o->object.size);
677 if (l <= offsetof(Object, data.payload))
cec736d2
LP
678 return -EBADMSG;
679
807e17f0
LP
680 l -= offsetof(Object, data.payload);
681
682 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
683 return -EBADMSG;
684
b785c858 685 if (rsize == size &&
807e17f0
LP
686 memcmp(f->compress_buffer, data, size) == 0) {
687
688 if (ret)
689 *ret = o;
690
691 if (offset)
692 *offset = p;
693
694 return 1;
695 }
696#else
697 return -EPROTONOSUPPORT;
698#endif
699
700 } else if (le64toh(o->object.size) == osize &&
701 memcmp(o->data.payload, data, size) == 0) {
702
cec736d2
LP
703 if (ret)
704 *ret = o;
705
706 if (offset)
707 *offset = p;
708
de190aef 709 return 1;
cec736d2
LP
710 }
711
85a131e8 712 next:
cec736d2
LP
713 p = le64toh(o->data.next_hash_offset);
714 }
715
de190aef
LP
716 return 0;
717}
718
719int journal_file_find_data_object(
720 JournalFile *f,
721 const void *data, uint64_t size,
722 Object **ret, uint64_t *offset) {
723
724 uint64_t hash;
725
726 assert(f);
727 assert(data || size == 0);
728
729 hash = hash64(data, size);
730
731 return journal_file_find_data_object_with_hash(f,
732 data, size, hash,
733 ret, offset);
734}
735
48496df6
LP
736static int journal_file_append_data(
737 JournalFile *f,
738 const void *data, uint64_t size,
739 Object **ret, uint64_t *offset) {
740
de190aef
LP
741 uint64_t hash, p;
742 uint64_t osize;
743 Object *o;
744 int r;
807e17f0 745 bool compressed = false;
de190aef
LP
746
747 assert(f);
748 assert(data || size == 0);
749
750 hash = hash64(data, size);
751
752 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
753 if (r < 0)
754 return r;
755 else if (r > 0) {
756
757 if (ret)
758 *ret = o;
759
760 if (offset)
761 *offset = p;
762
763 return 0;
764 }
765
766 osize = offsetof(Object, data.payload) + size;
767 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
768 if (r < 0)
769 return r;
770
cec736d2 771 o->data.hash = htole64(hash);
807e17f0
LP
772
773#ifdef HAVE_XZ
774 if (f->compress &&
775 size >= COMPRESSION_SIZE_THRESHOLD) {
776 uint64_t rsize;
777
778 compressed = compress_blob(data, size, o->data.payload, &rsize);
779
780 if (compressed) {
781 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
782 o->object.flags |= OBJECT_COMPRESSED;
783
784 f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
785
786 log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
787 }
788 }
789#endif
790
791 if (!compressed)
792 memcpy(o->data.payload, data, size);
cec736d2 793
de190aef 794 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
795 if (r < 0)
796 return r;
797
48496df6
LP
798 /* The linking might have altered the window, so let's
799 * refresh our pointer */
800 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
801 if (r < 0)
802 return r;
803
cec736d2
LP
804 if (ret)
805 *ret = o;
806
807 if (offset)
de190aef 808 *offset = p;
cec736d2
LP
809
810 return 0;
811}
812
813uint64_t journal_file_entry_n_items(Object *o) {
814 assert(o);
7be3aa17 815 assert(o->object.type == OBJECT_ENTRY);
cec736d2
LP
816
817 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
818}
819
de190aef
LP
820static uint64_t journal_file_entry_array_n_items(Object *o) {
821 assert(o);
7be3aa17 822 assert(o->object.type == OBJECT_ENTRY_ARRAY);
de190aef
LP
823
824 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
825}
826
827static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
828 le64_t *first,
829 le64_t *idx,
de190aef 830 uint64_t p) {
cec736d2 831 int r;
de190aef
LP
832 uint64_t n = 0, ap = 0, q, i, a, hidx;
833 Object *o;
834
cec736d2 835 assert(f);
de190aef
LP
836 assert(first);
837 assert(idx);
838 assert(p > 0);
cec736d2 839
de190aef
LP
840 a = le64toh(*first);
841 i = hidx = le64toh(*idx);
842 while (a > 0) {
843
844 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
845 if (r < 0)
846 return r;
cec736d2 847
de190aef
LP
848 n = journal_file_entry_array_n_items(o);
849 if (i < n) {
850 o->entry_array.items[i] = htole64(p);
851 *idx = htole64(hidx + 1);
852 return 0;
853 }
cec736d2 854
de190aef
LP
855 i -= n;
856 ap = a;
857 a = le64toh(o->entry_array.next_entry_array_offset);
858 }
859
860 if (hidx > n)
861 n = (hidx+1) * 2;
862 else
863 n = n * 2;
864
865 if (n < 4)
866 n = 4;
867
868 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
869 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
870 &o, &q);
cec736d2
LP
871 if (r < 0)
872 return r;
873
de190aef 874 o->entry_array.items[i] = htole64(p);
cec736d2 875
de190aef 876 if (ap == 0)
7be3aa17 877 *first = htole64(q);
cec736d2 878 else {
de190aef 879 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
880 if (r < 0)
881 return r;
882
de190aef
LP
883 o->entry_array.next_entry_array_offset = htole64(q);
884 }
cec736d2 885
de190aef
LP
886 *idx = htole64(hidx + 1);
887
888 return 0;
889}
cec736d2 890
de190aef 891static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
892 le64_t *extra,
893 le64_t *first,
894 le64_t *idx,
de190aef
LP
895 uint64_t p) {
896
897 int r;
898
899 assert(f);
900 assert(extra);
901 assert(first);
902 assert(idx);
903 assert(p > 0);
904
905 if (*idx == 0)
906 *extra = htole64(p);
907 else {
4fd052ae 908 le64_t i;
de190aef 909
7be3aa17 910 i = htole64(le64toh(*idx) - 1);
de190aef
LP
911 r = link_entry_into_array(f, first, &i, p);
912 if (r < 0)
913 return r;
cec736d2
LP
914 }
915
de190aef
LP
916 *idx = htole64(le64toh(*idx) + 1);
917 return 0;
918}
919
920static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
921 uint64_t p;
922 int r;
923 assert(f);
924 assert(o);
925 assert(offset > 0);
926
927 p = le64toh(o->entry.items[i].object_offset);
928 if (p == 0)
929 return -EINVAL;
930
931 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
932 if (r < 0)
933 return r;
934
de190aef
LP
935 return link_entry_into_array_plus_one(f,
936 &o->data.entry_offset,
937 &o->data.entry_array_offset,
938 &o->data.n_entries,
939 offset);
cec736d2
LP
940}
941
942static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 943 uint64_t n, i;
cec736d2
LP
944 int r;
945
946 assert(f);
947 assert(o);
948 assert(offset > 0);
de190aef 949 assert(o->object.type == OBJECT_ENTRY);
cec736d2 950
b788cc23
LP
951 __sync_synchronize();
952
cec736d2 953 /* Link up the entry itself */
de190aef
LP
954 r = link_entry_into_array(f,
955 &f->header->entry_array_offset,
956 &f->header->n_entries,
957 offset);
958 if (r < 0)
959 return r;
cec736d2 960
aaf53376 961 /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
cec736d2 962
de190aef 963 if (f->header->head_entry_realtime == 0)
0ac38b70 964 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 965
0ac38b70 966 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
967 f->header->tail_entry_monotonic = o->entry.monotonic;
968
969 f->tail_entry_monotonic_valid = true;
cec736d2
LP
970
971 /* Link up the items */
972 n = journal_file_entry_n_items(o);
973 for (i = 0; i < n; i++) {
974 r = journal_file_link_entry_item(f, o, offset, i);
975 if (r < 0)
976 return r;
977 }
978
cec736d2
LP
979 return 0;
980}
981
982static int journal_file_append_entry_internal(
983 JournalFile *f,
984 const dual_timestamp *ts,
985 uint64_t xor_hash,
986 const EntryItem items[], unsigned n_items,
de190aef 987 uint64_t *seqnum,
cec736d2
LP
988 Object **ret, uint64_t *offset) {
989 uint64_t np;
990 uint64_t osize;
991 Object *o;
992 int r;
993
994 assert(f);
995 assert(items || n_items == 0);
de190aef 996 assert(ts);
cec736d2
LP
997
998 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
999
de190aef 1000 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1001 if (r < 0)
1002 return r;
1003
de190aef 1004 o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
cec736d2 1005 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1006 o->entry.realtime = htole64(ts->realtime);
1007 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1008 o->entry.xor_hash = htole64(xor_hash);
1009 o->entry.boot_id = f->header->boot_id;
1010
1011 r = journal_file_link_entry(f, o, np);
1012 if (r < 0)
1013 return r;
1014
1015 if (ret)
1016 *ret = o;
1017
1018 if (offset)
1019 *offset = np;
1020
1021 return 0;
1022}
1023
cf244689 1024void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1025 assert(f);
1026
1027 /* inotify() does not receive IN_MODIFY events from file
1028 * accesses done via mmap(). After each access we hence
1029 * trigger IN_MODIFY by truncating the journal file to its
1030 * current size which triggers IN_MODIFY. */
1031
bc85bfee
LP
1032 __sync_synchronize();
1033
50f20cfd
LP
1034 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1035 log_error("Failed to to truncate file to its own size: %m");
1036}
1037
de190aef 1038int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1039 unsigned i;
1040 EntryItem *items;
1041 int r;
1042 uint64_t xor_hash = 0;
de190aef 1043 struct dual_timestamp _ts;
cec736d2
LP
1044
1045 assert(f);
1046 assert(iovec || n_iovec == 0);
1047
de190aef
LP
1048 if (!f->writable)
1049 return -EPERM;
1050
1051 if (!ts) {
1052 dual_timestamp_get(&_ts);
1053 ts = &_ts;
1054 }
1055
1056 if (f->tail_entry_monotonic_valid &&
1057 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1058 return -EINVAL;
1059
cf244689 1060 items = alloca(sizeof(EntryItem) * n_iovec);
cec736d2
LP
1061
1062 for (i = 0; i < n_iovec; i++) {
1063 uint64_t p;
1064 Object *o;
1065
1066 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1067 if (r < 0)
cf244689 1068 return r;
cec736d2
LP
1069
1070 xor_hash ^= le64toh(o->data.hash);
1071 items[i].object_offset = htole64(p);
de7b95cd 1072 items[i].hash = o->data.hash;
cec736d2
LP
1073 }
1074
de190aef 1075 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1076
50f20cfd
LP
1077 journal_file_post_change(f);
1078
cec736d2
LP
1079 return r;
1080}
1081
de190aef
LP
1082static int generic_array_get(JournalFile *f,
1083 uint64_t first,
1084 uint64_t i,
1085 Object **ret, uint64_t *offset) {
1086
cec736d2 1087 Object *o;
6c8a39b8 1088 uint64_t p = 0, a;
cec736d2
LP
1089 int r;
1090
1091 assert(f);
1092
de190aef
LP
1093 a = first;
1094 while (a > 0) {
1095 uint64_t n;
cec736d2 1096
de190aef
LP
1097 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1098 if (r < 0)
1099 return r;
cec736d2 1100
de190aef
LP
1101 n = journal_file_entry_array_n_items(o);
1102 if (i < n) {
1103 p = le64toh(o->entry_array.items[i]);
1104 break;
cec736d2
LP
1105 }
1106
de190aef
LP
1107 i -= n;
1108 a = le64toh(o->entry_array.next_entry_array_offset);
1109 }
1110
1111 if (a <= 0 || p <= 0)
1112 return 0;
1113
1114 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1115 if (r < 0)
1116 return r;
1117
1118 if (ret)
1119 *ret = o;
1120
1121 if (offset)
1122 *offset = p;
1123
1124 return 1;
1125}
1126
1127static int generic_array_get_plus_one(JournalFile *f,
1128 uint64_t extra,
1129 uint64_t first,
1130 uint64_t i,
1131 Object **ret, uint64_t *offset) {
1132
1133 Object *o;
1134
1135 assert(f);
1136
1137 if (i == 0) {
1138 int r;
1139
1140 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1141 if (r < 0)
1142 return r;
1143
de190aef
LP
1144 if (ret)
1145 *ret = o;
cec736d2 1146
de190aef
LP
1147 if (offset)
1148 *offset = extra;
cec736d2 1149
de190aef 1150 return 1;
cec736d2
LP
1151 }
1152
de190aef
LP
1153 return generic_array_get(f, first, i-1, ret, offset);
1154}
cec736d2 1155
de190aef
LP
/* Results the test_object() callback of generic_array_bisect()
 * reports for the object it was asked to compare with the needle.
 * TEST_FOUND is converted into TEST_LEFT or TEST_RIGHT depending on
 * the search direction; on TEST_RIGHT the bisection lowers its upper
 * bound to the tested index, otherwise it raises its lower bound
 * beyond it. */
enum {
        TEST_FOUND = 0,
        TEST_LEFT = 1,
        TEST_RIGHT = 2
};
cec736d2 1161
de190aef
LP
1162static int generic_array_bisect(JournalFile *f,
1163 uint64_t first,
1164 uint64_t n,
1165 uint64_t needle,
1166 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1167 direction_t direction,
1168 Object **ret,
1169 uint64_t *offset,
1170 uint64_t *idx) {
1171
1172 uint64_t a, p, t = 0, i = 0, last_p = 0;
1173 bool subtract_one = false;
1174 Object *o, *array = NULL;
1175 int r;
cec736d2 1176
de190aef
LP
1177 assert(f);
1178 assert(test_object);
cec736d2 1179
de190aef
LP
1180 a = first;
1181 while (a > 0) {
1182 uint64_t left, right, k, lp;
1183
1184 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1185 if (r < 0)
1186 return r;
1187
de190aef
LP
1188 k = journal_file_entry_array_n_items(array);
1189 right = MIN(k, n);
1190 if (right <= 0)
1191 return 0;
cec736d2 1192
de190aef
LP
1193 i = right - 1;
1194 lp = p = le64toh(array->entry_array.items[i]);
1195 if (p <= 0)
1196 return -EBADMSG;
cec736d2 1197
de190aef
LP
1198 r = test_object(f, p, needle);
1199 if (r < 0)
1200 return r;
cec736d2 1201
de190aef
LP
1202 if (r == TEST_FOUND)
1203 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1204
1205 if (r == TEST_RIGHT) {
1206 left = 0;
1207 right -= 1;
1208 for (;;) {
1209 if (left == right) {
1210 if (direction == DIRECTION_UP)
1211 subtract_one = true;
1212
1213 i = left;
1214 goto found;
1215 }
1216
1217 assert(left < right);
1218
1219 i = (left + right) / 2;
1220 p = le64toh(array->entry_array.items[i]);
1221 if (p <= 0)
1222 return -EBADMSG;
1223
1224 r = test_object(f, p, needle);
1225 if (r < 0)
1226 return r;
cec736d2 1227
de190aef
LP
1228 if (r == TEST_FOUND)
1229 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1230
1231 if (r == TEST_RIGHT)
1232 right = i;
1233 else
1234 left = i + 1;
1235 }
1236 }
1237
cbdca852
LP
1238 if (k > n) {
1239 if (direction == DIRECTION_UP) {
1240 i = n;
1241 subtract_one = true;
1242 goto found;
1243 }
1244
cec736d2 1245 return 0;
cbdca852 1246 }
cec736d2 1247
de190aef
LP
1248 last_p = lp;
1249
1250 n -= k;
1251 t += k;
1252 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
1253 }
1254
1255 return 0;
de190aef
LP
1256
1257found:
1258 if (subtract_one && t == 0 && i == 0)
1259 return 0;
1260
1261 if (subtract_one && i == 0)
1262 p = last_p;
1263 else if (subtract_one)
1264 p = le64toh(array->entry_array.items[i-1]);
1265 else
1266 p = le64toh(array->entry_array.items[i]);
1267
1268 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1269 if (r < 0)
1270 return r;
1271
1272 if (ret)
1273 *ret = o;
1274
1275 if (offset)
1276 *offset = p;
1277
1278 if (idx)
cbdca852 1279 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
1280
1281 return 1;
cec736d2
LP
1282}
1283
de190aef
LP
1284static int generic_array_bisect_plus_one(JournalFile *f,
1285 uint64_t extra,
1286 uint64_t first,
1287 uint64_t n,
1288 uint64_t needle,
1289 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1290 direction_t direction,
1291 Object **ret,
1292 uint64_t *offset,
1293 uint64_t *idx) {
1294
cec736d2 1295 int r;
cbdca852
LP
1296 bool step_back = false;
1297 Object *o;
cec736d2
LP
1298
1299 assert(f);
de190aef 1300 assert(test_object);
cec736d2 1301
de190aef
LP
1302 if (n <= 0)
1303 return 0;
cec736d2 1304
de190aef
LP
1305 /* This bisects the array in object 'first', but first checks
1306 * an extra */
de190aef
LP
1307 r = test_object(f, extra, needle);
1308 if (r < 0)
1309 return r;
a536e261
LP
1310
1311 if (r == TEST_FOUND)
1312 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1313
cbdca852
LP
1314 /* if we are looking with DIRECTION_UP then we need to first
1315 see if in the actual array there is a matching entry, and
1316 return the last one of that. But if there isn't any we need
1317 to return this one. Hence remember this, and return it
1318 below. */
1319 if (r == TEST_LEFT)
1320 step_back = direction == DIRECTION_UP;
de190aef 1321
cbdca852
LP
1322 if (r == TEST_RIGHT) {
1323 if (direction == DIRECTION_DOWN)
1324 goto found;
1325 else
1326 return 0;
a536e261 1327 }
cec736d2 1328
de190aef
LP
1329 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1330
cbdca852
LP
1331 if (r == 0 && step_back)
1332 goto found;
1333
ecf68b1d 1334 if (r > 0 && idx)
de190aef
LP
1335 (*idx) ++;
1336
1337 return r;
cbdca852
LP
1338
1339found:
1340 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1341 if (r < 0)
1342 return r;
1343
1344 if (ret)
1345 *ret = o;
1346
1347 if (offset)
1348 *offset = extra;
1349
1350 if (idx)
1351 *idx = 0;
1352
1353 return 1;
1354}
1355
1356static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1357 assert(f);
1358 assert(p > 0);
1359
1360 if (p == needle)
1361 return TEST_FOUND;
1362 else if (p < needle)
1363 return TEST_LEFT;
1364 else
1365 return TEST_RIGHT;
1366}
1367
1368int journal_file_move_to_entry_by_offset(
1369 JournalFile *f,
1370 uint64_t p,
1371 direction_t direction,
1372 Object **ret,
1373 uint64_t *offset) {
1374
1375 return generic_array_bisect(f,
1376 le64toh(f->header->entry_array_offset),
1377 le64toh(f->header->n_entries),
1378 p,
1379 test_object_offset,
1380 direction,
1381 ret, offset, NULL);
de190aef
LP
1382}
1383
cbdca852 1384
de190aef
LP
1385static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1386 Object *o;
1387 int r;
1388
1389 assert(f);
1390 assert(p > 0);
1391
1392 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
1393 if (r < 0)
1394 return r;
1395
de190aef
LP
1396 if (le64toh(o->entry.seqnum) == needle)
1397 return TEST_FOUND;
1398 else if (le64toh(o->entry.seqnum) < needle)
1399 return TEST_LEFT;
1400 else
1401 return TEST_RIGHT;
1402}
cec736d2 1403
de190aef
LP
1404int journal_file_move_to_entry_by_seqnum(
1405 JournalFile *f,
1406 uint64_t seqnum,
1407 direction_t direction,
1408 Object **ret,
1409 uint64_t *offset) {
1410
1411 return generic_array_bisect(f,
1412 le64toh(f->header->entry_array_offset),
1413 le64toh(f->header->n_entries),
1414 seqnum,
1415 test_object_seqnum,
1416 direction,
1417 ret, offset, NULL);
1418}
cec736d2 1419
de190aef
LP
1420static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1421 Object *o;
1422 int r;
1423
1424 assert(f);
1425 assert(p > 0);
1426
1427 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1428 if (r < 0)
1429 return r;
1430
1431 if (le64toh(o->entry.realtime) == needle)
1432 return TEST_FOUND;
1433 else if (le64toh(o->entry.realtime) < needle)
1434 return TEST_LEFT;
1435 else
1436 return TEST_RIGHT;
cec736d2
LP
1437}
1438
de190aef
LP
1439int journal_file_move_to_entry_by_realtime(
1440 JournalFile *f,
1441 uint64_t realtime,
1442 direction_t direction,
1443 Object **ret,
1444 uint64_t *offset) {
1445
1446 return generic_array_bisect(f,
1447 le64toh(f->header->entry_array_offset),
1448 le64toh(f->header->n_entries),
1449 realtime,
1450 test_object_realtime,
1451 direction,
1452 ret, offset, NULL);
1453}
1454
1455static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1456 Object *o;
1457 int r;
1458
1459 assert(f);
1460 assert(p > 0);
1461
1462 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1463 if (r < 0)
1464 return r;
1465
1466 if (le64toh(o->entry.monotonic) == needle)
1467 return TEST_FOUND;
1468 else if (le64toh(o->entry.monotonic) < needle)
1469 return TEST_LEFT;
1470 else
1471 return TEST_RIGHT;
1472}
1473
1474int journal_file_move_to_entry_by_monotonic(
1475 JournalFile *f,
1476 sd_id128_t boot_id,
1477 uint64_t monotonic,
1478 direction_t direction,
1479 Object **ret,
1480 uint64_t *offset) {
1481
10b6f904 1482 char t[9+32+1] = "_BOOT_ID=";
de190aef
LP
1483 Object *o;
1484 int r;
1485
cbdca852 1486 assert(f);
de190aef 1487
cbdca852 1488 sd_id128_to_string(boot_id, t + 9);
de190aef
LP
1489 r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1490 if (r < 0)
1491 return r;
cbdca852 1492 if (r == 0)
de190aef
LP
1493 return -ENOENT;
1494
1495 return generic_array_bisect_plus_one(f,
1496 le64toh(o->data.entry_offset),
1497 le64toh(o->data.entry_array_offset),
1498 le64toh(o->data.n_entries),
1499 monotonic,
1500 test_object_monotonic,
1501 direction,
1502 ret, offset, NULL);
1503}
1504
de190aef
LP
1505int journal_file_next_entry(
1506 JournalFile *f,
1507 Object *o, uint64_t p,
1508 direction_t direction,
1509 Object **ret, uint64_t *offset) {
1510
1511 uint64_t i, n;
cec736d2
LP
1512 int r;
1513
1514 assert(f);
de190aef
LP
1515 assert(p > 0 || !o);
1516
1517 n = le64toh(f->header->n_entries);
1518 if (n <= 0)
1519 return 0;
cec736d2
LP
1520
1521 if (!o)
de190aef 1522 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 1523 else {
de190aef 1524 if (o->object.type != OBJECT_ENTRY)
cec736d2
LP
1525 return -EINVAL;
1526
de190aef
LP
1527 r = generic_array_bisect(f,
1528 le64toh(f->header->entry_array_offset),
1529 le64toh(f->header->n_entries),
1530 p,
1531 test_object_offset,
1532 DIRECTION_DOWN,
1533 NULL, NULL,
1534 &i);
1535 if (r <= 0)
1536 return r;
1537
1538 if (direction == DIRECTION_DOWN) {
1539 if (i >= n - 1)
1540 return 0;
1541
1542 i++;
1543 } else {
1544 if (i <= 0)
1545 return 0;
1546
1547 i--;
1548 }
cec736d2
LP
1549 }
1550
de190aef
LP
1551 /* And jump to it */
1552 return generic_array_get(f,
1553 le64toh(f->header->entry_array_offset),
1554 i,
1555 ret, offset);
1556}
cec736d2 1557
de190aef
LP
1558int journal_file_skip_entry(
1559 JournalFile *f,
1560 Object *o, uint64_t p,
1561 int64_t skip,
1562 Object **ret, uint64_t *offset) {
1563
1564 uint64_t i, n;
1565 int r;
1566
1567 assert(f);
1568 assert(o);
1569 assert(p > 0);
1570
1571 if (o->object.type != OBJECT_ENTRY)
1572 return -EINVAL;
1573
1574 r = generic_array_bisect(f,
1575 le64toh(f->header->entry_array_offset),
1576 le64toh(f->header->n_entries),
1577 p,
1578 test_object_offset,
1579 DIRECTION_DOWN,
1580 NULL, NULL,
1581 &i);
1582 if (r <= 0)
cec736d2
LP
1583 return r;
1584
de190aef
LP
1585 /* Calculate new index */
1586 if (skip < 0) {
1587 if ((uint64_t) -skip >= i)
1588 i = 0;
1589 else
1590 i = i - (uint64_t) -skip;
1591 } else
1592 i += (uint64_t) skip;
cec736d2 1593
de190aef
LP
1594 n = le64toh(f->header->n_entries);
1595 if (n <= 0)
1596 return -EBADMSG;
cec736d2 1597
de190aef
LP
1598 if (i >= n)
1599 i = n-1;
1600
1601 return generic_array_get(f,
1602 le64toh(f->header->entry_array_offset),
1603 i,
1604 ret, offset);
cec736d2
LP
1605}
1606
de190aef
LP
1607int journal_file_next_entry_for_data(
1608 JournalFile *f,
1609 Object *o, uint64_t p,
1610 uint64_t data_offset,
1611 direction_t direction,
1612 Object **ret, uint64_t *offset) {
1613
1614 uint64_t n, i;
cec736d2 1615 int r;
de190aef 1616 Object *d;
cec736d2
LP
1617
1618 assert(f);
de190aef 1619 assert(p > 0 || !o);
cec736d2 1620
de190aef 1621 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 1622 if (r < 0)
de190aef 1623 return r;
cec736d2 1624
de190aef
LP
1625 n = le64toh(d->data.n_entries);
1626 if (n <= 0)
1627 return n;
cec736d2 1628
de190aef
LP
1629 if (!o)
1630 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1631 else {
1632 if (o->object.type != OBJECT_ENTRY)
1633 return -EINVAL;
cec736d2 1634
de190aef
LP
1635 r = generic_array_bisect_plus_one(f,
1636 le64toh(d->data.entry_offset),
1637 le64toh(d->data.entry_array_offset),
1638 le64toh(d->data.n_entries),
1639 p,
1640 test_object_offset,
1641 DIRECTION_DOWN,
1642 NULL, NULL,
1643 &i);
1644
1645 if (r <= 0)
cec736d2
LP
1646 return r;
1647
de190aef
LP
1648 if (direction == DIRECTION_DOWN) {
1649 if (i >= n - 1)
1650 return 0;
cec736d2 1651
de190aef
LP
1652 i++;
1653 } else {
1654 if (i <= 0)
1655 return 0;
cec736d2 1656
de190aef
LP
1657 i--;
1658 }
cec736d2 1659
de190aef 1660 }
cec736d2 1661
de190aef
LP
1662 return generic_array_get_plus_one(f,
1663 le64toh(d->data.entry_offset),
1664 le64toh(d->data.entry_array_offset),
1665 i,
1666 ret, offset);
1667}
cec736d2 1668
cbdca852
LP
1669int journal_file_move_to_entry_by_offset_for_data(
1670 JournalFile *f,
1671 uint64_t data_offset,
1672 uint64_t p,
1673 direction_t direction,
1674 Object **ret, uint64_t *offset) {
1675
1676 int r;
1677 Object *d;
1678
1679 assert(f);
1680
1681 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1682 if (r < 0)
1683 return r;
1684
1685 return generic_array_bisect_plus_one(f,
1686 le64toh(d->data.entry_offset),
1687 le64toh(d->data.entry_array_offset),
1688 le64toh(d->data.n_entries),
1689 p,
1690 test_object_offset,
1691 direction,
1692 ret, offset, NULL);
1693}
1694
1695int journal_file_move_to_entry_by_monotonic_for_data(
1696 JournalFile *f,
1697 uint64_t data_offset,
1698 sd_id128_t boot_id,
1699 uint64_t monotonic,
1700 direction_t direction,
1701 Object **ret, uint64_t *offset) {
1702
1703 char t[9+32+1] = "_BOOT_ID=";
1704 Object *o, *d;
1705 int r;
1706 uint64_t b, z;
1707
1708 assert(f);
1709
1710 /* First, seek by time */
1711 sd_id128_to_string(boot_id, t + 9);
1712 r = journal_file_find_data_object(f, t, strlen(t), &o, &b);
1713 if (r < 0)
1714 return r;
1715 if (r == 0)
1716 return -ENOENT;
1717
1718 r = generic_array_bisect_plus_one(f,
1719 le64toh(o->data.entry_offset),
1720 le64toh(o->data.entry_array_offset),
1721 le64toh(o->data.n_entries),
1722 monotonic,
1723 test_object_monotonic,
1724 direction,
1725 NULL, &z, NULL);
1726 if (r <= 0)
1727 return r;
1728
1729 /* And now, continue seeking until we find an entry that
1730 * exists in both bisection arrays */
1731
1732 for (;;) {
1733 Object *qo;
1734 uint64_t p, q;
1735
1736 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1737 if (r < 0)
1738 return r;
1739
1740 r = generic_array_bisect_plus_one(f,
1741 le64toh(d->data.entry_offset),
1742 le64toh(d->data.entry_array_offset),
1743 le64toh(d->data.n_entries),
1744 z,
1745 test_object_offset,
1746 direction,
1747 NULL, &p, NULL);
1748 if (r <= 0)
1749 return r;
1750
1751 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
1752 if (r < 0)
1753 return r;
1754
1755 r = generic_array_bisect_plus_one(f,
1756 le64toh(o->data.entry_offset),
1757 le64toh(o->data.entry_array_offset),
1758 le64toh(o->data.n_entries),
1759 p,
1760 test_object_offset,
1761 direction,
1762 &qo, &q, NULL);
1763
1764 if (r <= 0)
1765 return r;
1766
1767 if (p == q) {
1768 if (ret)
1769 *ret = qo;
1770 if (offset)
1771 *offset = q;
1772
1773 return 1;
1774 }
1775
1776 z = q;
1777 }
1778
1779 return 0;
1780}
1781
de190aef
LP
1782int journal_file_move_to_entry_by_seqnum_for_data(
1783 JournalFile *f,
1784 uint64_t data_offset,
1785 uint64_t seqnum,
1786 direction_t direction,
1787 Object **ret, uint64_t *offset) {
cec736d2 1788
de190aef
LP
1789 Object *d;
1790 int r;
cec736d2 1791
91a31dde
LP
1792 assert(f);
1793
de190aef 1794 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 1795 if (r < 0)
de190aef 1796 return r;
cec736d2 1797
de190aef
LP
1798 return generic_array_bisect_plus_one(f,
1799 le64toh(d->data.entry_offset),
1800 le64toh(d->data.entry_array_offset),
1801 le64toh(d->data.n_entries),
1802 seqnum,
1803 test_object_seqnum,
1804 direction,
1805 ret, offset, NULL);
1806}
cec736d2 1807
de190aef
LP
1808int journal_file_move_to_entry_by_realtime_for_data(
1809 JournalFile *f,
1810 uint64_t data_offset,
1811 uint64_t realtime,
1812 direction_t direction,
1813 Object **ret, uint64_t *offset) {
1814
1815 Object *d;
1816 int r;
1817
91a31dde
LP
1818 assert(f);
1819
de190aef 1820 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 1821 if (r < 0)
de190aef
LP
1822 return r;
1823
1824 return generic_array_bisect_plus_one(f,
1825 le64toh(d->data.entry_offset),
1826 le64toh(d->data.entry_array_offset),
1827 le64toh(d->data.n_entries),
1828 realtime,
1829 test_object_realtime,
1830 direction,
1831 ret, offset, NULL);
cec736d2
LP
1832}
1833
1834void journal_file_dump(JournalFile *f) {
cec736d2
LP
1835 Object *o;
1836 int r;
1837 uint64_t p;
1838
1839 assert(f);
1840
dca6219e 1841 journal_file_print_header(f);
cec736d2 1842
23b0b2b2 1843 p = le64toh(f->header->header_size);
cec736d2 1844 while (p != 0) {
de190aef 1845 r = journal_file_move_to_object(f, -1, p, &o);
cec736d2
LP
1846 if (r < 0)
1847 goto fail;
1848
1849 switch (o->object.type) {
1850
1851 case OBJECT_UNUSED:
1852 printf("Type: OBJECT_UNUSED\n");
1853 break;
1854
1855 case OBJECT_DATA:
1856 printf("Type: OBJECT_DATA\n");
1857 break;
1858
1859 case OBJECT_ENTRY:
3fbf9cbb
LP
1860 printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1861 (unsigned long long) le64toh(o->entry.seqnum),
1862 (unsigned long long) le64toh(o->entry.monotonic),
1863 (unsigned long long) le64toh(o->entry.realtime));
cec736d2
LP
1864 break;
1865
de190aef
LP
1866 case OBJECT_FIELD_HASH_TABLE:
1867 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
cec736d2
LP
1868 break;
1869
de190aef
LP
1870 case OBJECT_DATA_HASH_TABLE:
1871 printf("Type: OBJECT_DATA_HASH_TABLE\n");
1872 break;
1873
1874 case OBJECT_ENTRY_ARRAY:
1875 printf("Type: OBJECT_ENTRY_ARRAY\n");
cec736d2 1876 break;
8144056f
LP
1877
1878 case OBJECT_SIGNATURE:
1879 printf("Type: OBJECT_SIGNATURE\n");
1880 break;
cec736d2
LP
1881 }
1882
807e17f0
LP
1883 if (o->object.flags & OBJECT_COMPRESSED)
1884 printf("Flags: COMPRESSED\n");
1885
cec736d2
LP
1886 if (p == le64toh(f->header->tail_object_offset))
1887 p = 0;
1888 else
1889 p = p + ALIGN64(le64toh(o->object.size));
1890 }
1891
1892 return;
1893fail:
1894 log_error("File corrupt");
1895}
1896
dca6219e
LP
1897void journal_file_print_header(JournalFile *f) {
1898 char a[33], b[33], c[33];
1899 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX];
1900
1901 assert(f);
1902
1903 printf("File Path: %s\n"
1904 "File ID: %s\n"
1905 "Machine ID: %s\n"
1906 "Boot ID: %s\n"
1907 "Sequential Number ID: %s\n"
dc36ac67
LP
1908 "State: %s\n"
1909 "Compatible Flags:%s%s\n"
1910 "Incompatible Flags:%s%s\n"
dca6219e
LP
1911 "Header size: %llu\n"
1912 "Arena size: %llu\n"
1913 "Data Hash Table Size: %llu\n"
1914 "Field Hash Table Size: %llu\n"
1915 "Objects: %llu\n"
1916 "Entry Objects: %llu\n"
1917 "Rotate Suggested: %s\n"
1918 "Head Sequential Number: %llu\n"
1919 "Tail Sequential Number: %llu\n"
1920 "Head Realtime Timestamp: %s\n"
1921 "Tail Realtime Timestamp: %s\n",
1922 f->path,
1923 sd_id128_to_string(f->header->file_id, a),
1924 sd_id128_to_string(f->header->machine_id, b),
1925 sd_id128_to_string(f->header->boot_id, c),
1926 sd_id128_to_string(f->header->seqnum_id, c),
dc36ac67
LP
1927 f->header->state == STATE_OFFLINE ? "offline" :
1928 f->header->state == STATE_ONLINE ? "online" :
1929 f->header->state == STATE_ARCHIVED ? "archived" : "unknown",
1930 (f->header->compatible_flags & HEADER_COMPATIBLE_SIGNED) ? " SIGNED" : "",
1931 (f->header->compatible_flags & ~HEADER_COMPATIBLE_SIGNED) ? " ???" : "",
1932 (f->header->incompatible_flags & HEADER_INCOMPATIBLE_COMPRESSED) ? " COMPRESSED" : "",
1933 (f->header->incompatible_flags & ~HEADER_INCOMPATIBLE_COMPRESSED) ? " ???" : "",
dca6219e
LP
1934 (unsigned long long) le64toh(f->header->header_size),
1935 (unsigned long long) le64toh(f->header->arena_size),
1936 (unsigned long long) le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
1937 (unsigned long long) le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
1938 (unsigned long long) le64toh(f->header->n_objects),
1939 (unsigned long long) le64toh(f->header->n_entries),
1940 yes_no(journal_file_rotate_suggested(f)),
1941 (unsigned long long) le64toh(f->header->head_seqnum),
1942 (unsigned long long) le64toh(f->header->tail_seqnum),
1943 format_timestamp(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
1944 format_timestamp(y, sizeof(y), le64toh(f->header->tail_entry_realtime)));
1945
1946 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1947 printf("Data Objects: %llu\n"
1948 "Data Hash Table Fill: %.1f%%\n",
1949 (unsigned long long) le64toh(f->header->n_data),
1950 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
1951
1952 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1953 printf("Field Objects: %llu\n"
1954 "Field Hash Table Fill: %.1f%%\n",
1955 (unsigned long long) le64toh(f->header->n_fields),
1956 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
1957}
1958
cec736d2
LP
1959int journal_file_open(
1960 const char *fname,
1961 int flags,
1962 mode_t mode,
4a92baf3 1963 JournalMetrics *metrics,
0ac38b70 1964 JournalFile *template,
cec736d2
LP
1965 JournalFile **ret) {
1966
1967 JournalFile *f;
1968 int r;
1969 bool newly_created = false;
1970
1971 assert(fname);
1972
1973 if ((flags & O_ACCMODE) != O_RDONLY &&
1974 (flags & O_ACCMODE) != O_RDWR)
1975 return -EINVAL;
1976
9447a7f1
LP
1977 if (!endswith(fname, ".journal"))
1978 return -EINVAL;
1979
cec736d2
LP
1980 f = new0(JournalFile, 1);
1981 if (!f)
1982 return -ENOMEM;
1983
0ac38b70
LP
1984 f->fd = -1;
1985 f->flags = flags;
1986 f->mode = mode;
cec736d2
LP
1987 f->writable = (flags & O_ACCMODE) != O_RDONLY;
1988 f->prot = prot_from_flags(flags);
1989
4a92baf3 1990 if (template)
15944db8 1991 f->compress = template->compress;
15944db8 1992
cec736d2
LP
1993 f->path = strdup(fname);
1994 if (!f->path) {
1995 r = -ENOMEM;
1996 goto fail;
1997 }
1998
0ac38b70
LP
1999 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2000 if (f->fd < 0) {
2001 r = -errno;
2002 goto fail;
2003 }
2004
cec736d2
LP
2005 if (fstat(f->fd, &f->last_stat) < 0) {
2006 r = -errno;
2007 goto fail;
2008 }
2009
2010 if (f->last_stat.st_size == 0 && f->writable) {
2011 newly_created = true;
2012
0ac38b70 2013 r = journal_file_init_header(f, template);
cec736d2
LP
2014 if (r < 0)
2015 goto fail;
2016
2017 if (fstat(f->fd, &f->last_stat) < 0) {
2018 r = -errno;
2019 goto fail;
2020 }
2021 }
2022
dca6219e 2023 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
cec736d2
LP
2024 r = -EIO;
2025 goto fail;
2026 }
2027
2028 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
2029 if (f->header == MAP_FAILED) {
2030 f->header = NULL;
2031 r = -errno;
2032 goto fail;
2033 }
2034
2035 if (!newly_created) {
2036 r = journal_file_verify_header(f);
2037 if (r < 0)
2038 goto fail;
2039 }
2040
2041 if (f->writable) {
4a92baf3
LP
2042 if (metrics) {
2043 journal_default_metrics(metrics, f->fd);
2044 f->metrics = *metrics;
2045 } else if (template)
2046 f->metrics = template->metrics;
2047
cec736d2
LP
2048 r = journal_file_refresh_header(f);
2049 if (r < 0)
2050 goto fail;
2051 }
2052
2053 if (newly_created) {
2054
de190aef 2055 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
2056 if (r < 0)
2057 goto fail;
2058
de190aef 2059 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
2060 if (r < 0)
2061 goto fail;
2062 }
2063
de190aef 2064 r = journal_file_map_field_hash_table(f);
cec736d2
LP
2065 if (r < 0)
2066 goto fail;
2067
de190aef 2068 r = journal_file_map_data_hash_table(f);
cec736d2
LP
2069 if (r < 0)
2070 goto fail;
2071
2072 if (ret)
2073 *ret = f;
2074
2075 return 0;
2076
2077fail:
2078 journal_file_close(f);
2079
2080 return r;
2081}
0ac38b70
LP
2082
2083int journal_file_rotate(JournalFile **f) {
2084 char *p;
2085 size_t l;
2086 JournalFile *old_file, *new_file = NULL;
2087 int r;
2088
2089 assert(f);
2090 assert(*f);
2091
2092 old_file = *f;
2093
2094 if (!old_file->writable)
2095 return -EINVAL;
2096
2097 if (!endswith(old_file->path, ".journal"))
2098 return -EINVAL;
2099
2100 l = strlen(old_file->path);
2101
9447a7f1 2102 p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
0ac38b70
LP
2103 if (!p)
2104 return -ENOMEM;
2105
2106 memcpy(p, old_file->path, l - 8);
2107 p[l-8] = '@';
2108 sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
2109 snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
2110 "-%016llx-%016llx.journal",
dca6219e 2111 (unsigned long long) le64toh((*f)->header->tail_seqnum),
0ac38b70
LP
2112 (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
2113
2114 r = rename(old_file->path, p);
2115 free(p);
2116
2117 if (r < 0)
2118 return -errno;
2119
ccdbaf91 2120 old_file->header->state = STATE_ARCHIVED;
0ac38b70 2121
4a92baf3 2122 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, NULL, old_file, &new_file);
0ac38b70
LP
2123 journal_file_close(old_file);
2124
2125 *f = new_file;
2126 return r;
2127}
2128
9447a7f1
LP
2129int journal_file_open_reliably(
2130 const char *fname,
2131 int flags,
2132 mode_t mode,
4a92baf3 2133 JournalMetrics *metrics,
9447a7f1
LP
2134 JournalFile *template,
2135 JournalFile **ret) {
2136
2137 int r;
2138 size_t l;
2139 char *p;
2140
4a92baf3 2141 r = journal_file_open(fname, flags, mode, metrics, template, ret);
0071d9f1
LP
2142 if (r != -EBADMSG && /* corrupted */
2143 r != -ENODATA && /* truncated */
2144 r != -EHOSTDOWN && /* other machine */
2145 r != -EPROTONOSUPPORT) /* incompatible feature */
9447a7f1
LP
2146 return r;
2147
2148 if ((flags & O_ACCMODE) == O_RDONLY)
2149 return r;
2150
2151 if (!(flags & O_CREAT))
2152 return r;
2153
5c70eab4
LP
2154 /* The file is corrupted. Rotate it away and try it again (but only once) */
2155
9447a7f1
LP
2156 l = strlen(fname);
2157 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
2158 (int) (l-8), fname,
2159 (unsigned long long) now(CLOCK_REALTIME),
2160 random_ull()) < 0)
2161 return -ENOMEM;
2162
2163 r = rename(fname, p);
2164 free(p);
2165 if (r < 0)
2166 return -errno;
2167
2168 log_warning("File %s corrupted, renaming and replacing.", fname);
2169
4a92baf3 2170 return journal_file_open(fname, flags, mode, metrics, template, ret);
9447a7f1
LP
2171}
2172
0ac38b70
LP
2173struct vacuum_info {
2174 off_t usage;
2175 char *filename;
2176
2177 uint64_t realtime;
2178 sd_id128_t seqnum_id;
2179 uint64_t seqnum;
5c70eab4
LP
2180
2181 bool have_seqnum;
0ac38b70
LP
2182};
2183
2184static int vacuum_compare(const void *_a, const void *_b) {
2185 const struct vacuum_info *a, *b;
2186
2187 a = _a;
2188 b = _b;
2189
5c70eab4
LP
2190 if (a->have_seqnum && b->have_seqnum &&
2191 sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
0ac38b70
LP
2192 if (a->seqnum < b->seqnum)
2193 return -1;
2194 else if (a->seqnum > b->seqnum)
2195 return 1;
2196 else
2197 return 0;
2198 }
2199
2200 if (a->realtime < b->realtime)
2201 return -1;
2202 else if (a->realtime > b->realtime)
2203 return 1;
5c70eab4 2204 else if (a->have_seqnum && b->have_seqnum)
0ac38b70 2205 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
5c70eab4
LP
2206 else
2207 return strcmp(a->filename, b->filename);
0ac38b70
LP
2208}
2209
2210int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
2211 DIR *d;
2212 int r = 0;
2213 struct vacuum_info *list = NULL;
2214 unsigned n_list = 0, n_allocated = 0, i;
2215 uint64_t sum = 0;
2216
2217 assert(directory);
2218
2219 if (max_use <= 0)
babfc091 2220 return 0;
0ac38b70
LP
2221
2222 d = opendir(directory);
2223 if (!d)
2224 return -errno;
2225
2226 for (;;) {
2227 int k;
2228 struct dirent buf, *de;
2229 size_t q;
2230 struct stat st;
2231 char *p;
7ea07dcd 2232 unsigned long long seqnum = 0, realtime;
0ac38b70 2233 sd_id128_t seqnum_id;
5c70eab4 2234 bool have_seqnum;
0ac38b70
LP
2235
2236 k = readdir_r(d, &buf, &de);
2237 if (k != 0) {
2238 r = -k;
2239 goto finish;
2240 }
2241
2242 if (!de)
2243 break;
2244
5c70eab4
LP
2245 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
2246 continue;
2247
2248 if (!S_ISREG(st.st_mode))
0ac38b70
LP
2249 continue;
2250
2251 q = strlen(de->d_name);
2252
5c70eab4 2253 if (endswith(de->d_name, ".journal")) {
0ac38b70 2254
5c70eab4 2255 /* Vacuum archived files */
0ac38b70 2256
5c70eab4
LP
2257 if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
2258 continue;
0ac38b70 2259
5c70eab4
LP
2260 if (de->d_name[q-8-16-1] != '-' ||
2261 de->d_name[q-8-16-1-16-1] != '-' ||
2262 de->d_name[q-8-16-1-16-1-32-1] != '@')
2263 continue;
0ac38b70 2264
5c70eab4
LP
2265 p = strdup(de->d_name);
2266 if (!p) {
2267 r = -ENOMEM;
2268 goto finish;
2269 }
0ac38b70 2270
5c70eab4
LP
2271 de->d_name[q-8-16-1-16-1] = 0;
2272 if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2273 free(p);
2274 continue;
2275 }
2276
2277 if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2278 free(p);
2279 continue;
2280 }
2281
2282 have_seqnum = true;
2283
2284 } else if (endswith(de->d_name, ".journal~")) {
2285 unsigned long long tmp;
2286
2287 /* Vacuum corrupted files */
2288
2289 if (q < 1 + 16 + 1 + 16 + 8 + 1)
2290 continue;
0ac38b70 2291
5c70eab4
LP
2292 if (de->d_name[q-1-8-16-1] != '-' ||
2293 de->d_name[q-1-8-16-1-16-1] != '@')
2294 continue;
2295
2296 p = strdup(de->d_name);
2297 if (!p) {
2298 r = -ENOMEM;
2299 goto finish;
2300 }
2301
2302 if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2303 free(p);
2304 continue;
2305 }
2306
2307 have_seqnum = false;
2308 } else
0ac38b70 2309 continue;
0ac38b70
LP
2310
2311 if (n_list >= n_allocated) {
2312 struct vacuum_info *j;
2313
2314 n_allocated = MAX(n_allocated * 2U, 8U);
2315 j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2316 if (!j) {
2317 free(p);
2318 r = -ENOMEM;
2319 goto finish;
2320 }
2321
2322 list = j;
2323 }
2324
2325 list[n_list].filename = p;
a3a52c0f 2326 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
0ac38b70
LP
2327 list[n_list].seqnum = seqnum;
2328 list[n_list].realtime = realtime;
2329 list[n_list].seqnum_id = seqnum_id;
5c70eab4 2330 list[n_list].have_seqnum = have_seqnum;
0ac38b70
LP
2331
2332 sum += list[n_list].usage;
2333
2334 n_list ++;
2335 }
2336
2337 qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2338
2339 for(i = 0; i < n_list; i++) {
2340 struct statvfs ss;
2341
2342 if (fstatvfs(dirfd(d), &ss) < 0) {
2343 r = -errno;
2344 goto finish;
2345 }
2346
2347 if (sum <= max_use &&
2348 (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2349 break;
2350
2351 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
e7bf07b3 2352 log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
0ac38b70
LP
2353 sum -= list[i].usage;
2354 } else if (errno != ENOENT)
2355 log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2356 }
2357
2358finish:
2359 for (i = 0; i < n_list; i++)
2360 free(list[i].filename);
2361
2362 free(list);
2363
de190aef
LP
2364 if (d)
2365 closedir(d);
2366
0ac38b70
LP
2367 return r;
2368}
cf244689
LP
2369
2370int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2371 uint64_t i, n;
2372 uint64_t q, xor_hash = 0;
2373 int r;
2374 EntryItem *items;
2375 dual_timestamp ts;
2376
2377 assert(from);
2378 assert(to);
2379 assert(o);
2380 assert(p);
2381
2382 if (!to->writable)
2383 return -EPERM;
2384
2385 ts.monotonic = le64toh(o->entry.monotonic);
2386 ts.realtime = le64toh(o->entry.realtime);
2387
2388 if (to->tail_entry_monotonic_valid &&
2389 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2390 return -EINVAL;
2391
cf244689
LP
2392 n = journal_file_entry_n_items(o);
2393 items = alloca(sizeof(EntryItem) * n);
2394
2395 for (i = 0; i < n; i++) {
4fd052ae
FC
2396 uint64_t l, h;
2397 le64_t le_hash;
cf244689
LP
2398 size_t t;
2399 void *data;
2400 Object *u;
2401
2402 q = le64toh(o->entry.items[i].object_offset);
2403 le_hash = o->entry.items[i].hash;
2404
2405 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2406 if (r < 0)
2407 return r;
2408
2409 if (le_hash != o->data.hash)
2410 return -EBADMSG;
2411
2412 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2413 t = (size_t) l;
2414
2415 /* We hit the limit on 32bit machines */
2416 if ((uint64_t) t != l)
2417 return -E2BIG;
2418
2419 if (o->object.flags & OBJECT_COMPRESSED) {
2420#ifdef HAVE_XZ
2421 uint64_t rsize;
2422
2423 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2424 return -EBADMSG;
2425
2426 data = from->compress_buffer;
2427 l = rsize;
2428#else
2429 return -EPROTONOSUPPORT;
2430#endif
2431 } else
2432 data = o->data.payload;
2433
2434 r = journal_file_append_data(to, data, l, &u, &h);
2435 if (r < 0)
2436 return r;
2437
2438 xor_hash ^= le64toh(u->data.hash);
2439 items[i].object_offset = htole64(h);
2440 items[i].hash = u->data.hash;
2441
2442 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2443 if (r < 0)
2444 return r;
2445 }
2446
2447 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2448}
babfc091
LP
2449
2450void journal_default_metrics(JournalMetrics *m, int fd) {
2451 uint64_t fs_size = 0;
2452 struct statvfs ss;
a7bc2c2a 2453 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
babfc091
LP
2454
2455 assert(m);
2456 assert(fd >= 0);
2457
2458 if (fstatvfs(fd, &ss) >= 0)
2459 fs_size = ss.f_frsize * ss.f_blocks;
2460
2461 if (m->max_use == (uint64_t) -1) {
2462
2463 if (fs_size > 0) {
2464 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2465
2466 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2467 m->max_use = DEFAULT_MAX_USE_UPPER;
2468
2469 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2470 m->max_use = DEFAULT_MAX_USE_LOWER;
2471 } else
2472 m->max_use = DEFAULT_MAX_USE_LOWER;
2473 } else {
2474 m->max_use = PAGE_ALIGN(m->max_use);
2475
2476 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2477 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2478 }
2479
2480 if (m->max_size == (uint64_t) -1) {
2481 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2482
2483 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2484 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2485 } else
2486 m->max_size = PAGE_ALIGN(m->max_size);
2487
2488 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2489 m->max_size = JOURNAL_FILE_SIZE_MIN;
2490
2491 if (m->max_size*2 > m->max_use)
2492 m->max_use = m->max_size*2;
2493
2494 if (m->min_size == (uint64_t) -1)
2495 m->min_size = JOURNAL_FILE_SIZE_MIN;
2496 else {
2497 m->min_size = PAGE_ALIGN(m->min_size);
2498
2499 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2500 m->min_size = JOURNAL_FILE_SIZE_MIN;
2501
2502 if (m->min_size > m->max_size)
2503 m->max_size = m->min_size;
2504 }
2505
2506 if (m->keep_free == (uint64_t) -1) {
2507
2508 if (fs_size > 0) {
2509 m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2510
2511 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2512 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2513
2514 } else
2515 m->keep_free = DEFAULT_KEEP_FREE;
2516 }
2517
e7bf07b3
LP
2518 log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2519 format_bytes(a, sizeof(a), m->max_use),
2520 format_bytes(b, sizeof(b), m->max_size),
2521 format_bytes(c, sizeof(c), m->min_size),
2522 format_bytes(d, sizeof(d), m->keep_free));
babfc091 2523}
08984293
LP
2524
2525int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293
LP
2526 assert(f);
2527 assert(from || to);
2528
2529 if (from) {
162566a4
LP
2530 if (f->header->head_entry_realtime == 0)
2531 return -ENOENT;
08984293 2532
162566a4 2533 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
2534 }
2535
2536 if (to) {
162566a4
LP
2537 if (f->header->tail_entry_realtime == 0)
2538 return -ENOENT;
08984293 2539
162566a4 2540 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
2541 }
2542
2543 return 1;
2544}
2545
2546int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
2547 char t[9+32+1] = "_BOOT_ID=";
2548 Object *o;
2549 uint64_t p;
2550 int r;
2551
2552 assert(f);
2553 assert(from || to);
2554
2555 sd_id128_to_string(boot_id, t + 9);
2556
2557 r = journal_file_find_data_object(f, t, strlen(t), &o, &p);
2558 if (r <= 0)
2559 return r;
2560
2561 if (le64toh(o->data.n_entries) <= 0)
2562 return 0;
2563
2564 if (from) {
2565 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
2566 if (r < 0)
2567 return r;
2568
2569 *from = le64toh(o->entry.monotonic);
2570 }
2571
2572 if (to) {
2573 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
2574 if (r < 0)
2575 return r;
2576
2577 r = generic_array_get_plus_one(f,
2578 le64toh(o->data.entry_offset),
2579 le64toh(o->data.entry_array_offset),
2580 le64toh(o->data.n_entries)-1,
2581 &o, NULL);
2582 if (r <= 0)
2583 return r;
2584
2585 *to = le64toh(o->entry.monotonic);
2586 }
2587
2588 return 1;
2589}
dca6219e
LP
2590
2591bool journal_file_rotate_suggested(JournalFile *f) {
2592 assert(f);
2593
2594 /* If we gained new header fields we gained new features,
2595 * hence suggest a rotation */
2596 if (le64toh(f->header->header_size) < sizeof(Header))
2597 return true;
2598
2599 /* Let's check if the hash tables grew over a certain fill
2600 * level (75%, borrowing this value from Java's hash table
2601 * implementation), and if so suggest a rotation. To calculate
2602 * the fill level we need the n_data field, which only exists
2603 * in newer versions. */
2604
2605 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2606 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL)
2607 return true;
2608
2609 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2610 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL)
2611 return true;
2612
2613 return false;
2614}