]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
nspawn: explicitly remove veth links after use (#3111)
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
cec736d2
LP
1/***
2 This file is part of systemd.
3
4 Copyright 2011 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 14 Lesser General Public License for more details.
cec736d2 15
5430f7f2 16 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18***/
19
cec736d2 20#include <errno.h>
cec736d2 21#include <fcntl.h>
11689d2a 22#include <linux/fs.h>
ac2e41f5 23#include <pthread.h>
07630cea
LP
24#include <stddef.h>
25#include <sys/mman.h>
26#include <sys/statvfs.h>
27#include <sys/uio.h>
28#include <unistd.h>
fb0951b0 29
b5efdb8a 30#include "alloc-util.h"
f27a3864 31#include "btrfs-util.h"
c8b3094d 32#include "chattr-util.h"
07630cea 33#include "compress.h"
3ffd4af2 34#include "fd-util.h"
0284adc6 35#include "journal-authenticate.h"
cec736d2
LP
36#include "journal-def.h"
37#include "journal-file.h"
38#include "lookup3.h"
6bedfcbb 39#include "parse-util.h"
3df3e884 40#include "random-util.h"
7a24f3bf 41#include "sd-event.h"
b58c888f 42#include "set.h"
07630cea 43#include "string-util.h"
89a5a90c 44#include "xattr-util.h"
cec736d2 45
4a92baf3
LP
/* Default sizes (in bytes) of the data and field hash tables. */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL * sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL * sizeof(HashItem))

/* Size threshold (in bytes) used by the compression code. */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* The minimum journal file size. */
#define JOURNAL_FILE_SIZE_MIN (512ULL * 1024ULL)                          /* 512 KiB */

/* Lower and upper bounds applied when the max_use value is deduced from the
 * file system size. */
#define DEFAULT_MAX_USE_LOWER (1ULL * 1024ULL * 1024ULL)                  /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL * 1024ULL * 1024ULL * 1024ULL)        /* 4 GiB */

/* The default minimal use limit: how much we'll use even if keep_free
 * suggests otherwise. */
#define DEFAULT_MIN_USE (1ULL * 1024ULL * 1024ULL)                        /* 1 MiB */

/* Upper bound applied when max_size is deduced from max_use. */
#define DEFAULT_MAX_SIZE_UPPER (128ULL * 1024ULL * 1024ULL)               /* 128 MiB */

/* Upper bound applied when the keep_free value is deduced from the file
 * system size. */
#define DEFAULT_KEEP_FREE_UPPER (4ULL * 1024ULL * 1024ULL * 1024ULL)      /* 4 GiB */

/* The keep_free value used when the file system size can't be determined. */
#define DEFAULT_KEEP_FREE (1024ULL * 1024ULL)                             /* 1 MiB */

/* The default maximum number of journal files to keep around. */
#define DEFAULT_N_MAX_FILES (100)

/* n_data was the first entry added after the initial file format design,
 * hence everything before it must always be present in a header. */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at most. */
#define CHAIN_CACHE_MAX 20

/* By how much to grow the journal file each time something new is allocated. */
#define FILE_SIZE_INCREASE (8ULL * 1024ULL * 1024ULL)                     /* 8 MiB */

/* Re-run fstat() on the file at least this often, to detect deletions. */
#define LAST_STAT_REFRESH_USEC (5 * USEC_PER_SEC)

/* The mmap context used for the header: one above the last defined object type. */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX
90
ac2e41f5
VC
91/* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
92 * As a result we use atomic operations on f->offline_state for inter-thread communications with
93 * journal_file_set_offline() and journal_file_set_online(). */
94static void journal_file_set_offline_internal(JournalFile *f) {
26687bf8 95 assert(f);
ac2e41f5
VC
96 assert(f->fd >= 0);
97 assert(f->header);
98
99 for (;;) {
100 switch (f->offline_state) {
101 case OFFLINE_CANCEL:
102 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
103 continue;
104 return;
105
106 case OFFLINE_AGAIN_FROM_SYNCING:
107 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
108 continue;
109 break;
110
111 case OFFLINE_AGAIN_FROM_OFFLINING:
112 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
113 continue;
114 break;
115
116 case OFFLINE_SYNCING:
117 (void) fsync(f->fd);
26687bf8 118
ac2e41f5
VC
119 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
120 continue;
26687bf8 121
ac2e41f5
VC
122 f->header->state = STATE_OFFLINE;
123 (void) fsync(f->fd);
124 break;
125
126 case OFFLINE_OFFLINING:
127 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
128 continue;
129 /* fall through */
130
131 case OFFLINE_DONE:
132 return;
133
134 case OFFLINE_JOINED:
135 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
136 return;
137 }
138 }
139}
140
141static void * journal_file_set_offline_thread(void *arg) {
142 JournalFile *f = arg;
143
144 journal_file_set_offline_internal(f);
145
146 return NULL;
147}
148
149static int journal_file_set_offline_thread_join(JournalFile *f) {
150 int r;
151
152 assert(f);
153
154 if (f->offline_state == OFFLINE_JOINED)
155 return 0;
156
157 r = pthread_join(f->offline_thread, NULL);
158 if (r)
159 return -r;
160
161 f->offline_state = OFFLINE_JOINED;
26687bf8 162
fa6ac760
LP
163 if (mmap_cache_got_sigbus(f->mmap, f->fd))
164 return -EIO;
165
ac2e41f5
VC
166 return 0;
167}
26687bf8 168
ac2e41f5
VC
169/* Trigger a restart if the offline thread is mid-flight in a restartable state. */
170static bool journal_file_set_offline_try_restart(JournalFile *f) {
171 for (;;) {
172 switch (f->offline_state) {
173 case OFFLINE_AGAIN_FROM_SYNCING:
174 case OFFLINE_AGAIN_FROM_OFFLINING:
175 return true;
176
177 case OFFLINE_CANCEL:
178 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
179 continue;
180 return true;
181
182 case OFFLINE_SYNCING:
183 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
184 continue;
185 return true;
186
187 case OFFLINE_OFFLINING:
188 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
189 continue;
190 return true;
26687bf8
OS
191
192 default:
ac2e41f5
VC
193 return false;
194 }
26687bf8
OS
195 }
196}
197
ac2e41f5
VC
198/* Sets a journal offline.
199 *
200 * If wait is false then an offline is dispatched in a separate thread for a
201 * subsequent journal_file_set_offline() or journal_file_set_online() of the
202 * same journal to synchronize with.
203 *
204 * If wait is true, then either an existing offline thread will be restarted
205 * and joined, or if none exists the offline is simply performed in this
206 * context without involving another thread.
207 */
208int journal_file_set_offline(JournalFile *f, bool wait) {
209 bool restarted;
210 int r;
211
26687bf8
OS
212 assert(f);
213
214 if (!f->writable)
215 return -EPERM;
216
217 if (!(f->fd >= 0 && f->header))
218 return -EINVAL;
219
220 if (f->header->state != STATE_ONLINE)
221 return 0;
222
ac2e41f5
VC
223 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
224 restarted = journal_file_set_offline_try_restart(f);
225 if ((restarted && wait) || !restarted) {
226 r = journal_file_set_offline_thread_join(f);
227 if (r < 0)
228 return r;
229 }
26687bf8 230
ac2e41f5
VC
231 if (restarted)
232 return 0;
233
234 /* Initiate a new offline. */
235 f->offline_state = OFFLINE_SYNCING;
fa6ac760 236
ac2e41f5
VC
237 if (wait) /* Without using a thread if waiting. */
238 journal_file_set_offline_internal(f);
239 else {
240 r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
ec9ffa2c
VC
241 if (r > 0) {
242 f->offline_state = OFFLINE_JOINED;
ac2e41f5 243 return -r;
ec9ffa2c 244 }
ac2e41f5
VC
245 }
246
247 return 0;
248}
249
250static int journal_file_set_online(JournalFile *f) {
251 bool joined = false;
252
253 assert(f);
254
255 if (!f->writable)
256 return -EPERM;
257
258 if (!(f->fd >= 0 && f->header))
259 return -EINVAL;
260
261 while (!joined) {
262 switch (f->offline_state) {
263 case OFFLINE_JOINED:
264 /* No offline thread, no need to wait. */
265 joined = true;
266 break;
267
268 case OFFLINE_SYNCING:
269 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
270 continue;
271 /* Canceled syncing prior to offlining, no need to wait. */
272 break;
273
274 case OFFLINE_AGAIN_FROM_SYNCING:
275 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
276 continue;
277 /* Canceled restart from syncing, no need to wait. */
278 break;
279
280 case OFFLINE_AGAIN_FROM_OFFLINING:
281 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
282 continue;
283 /* Canceled restart from offlining, must wait for offlining to complete however. */
284
285 /* fall through to wait */
286 default: {
287 int r;
288
289 r = journal_file_set_offline_thread_join(f);
290 if (r < 0)
291 return r;
292
293 joined = true;
294 break;
295 }
296 }
297 }
26687bf8 298
fa6ac760
LP
299 if (mmap_cache_got_sigbus(f->mmap, f->fd))
300 return -EIO;
301
ac2e41f5
VC
302 switch (f->header->state) {
303 case STATE_ONLINE:
304 return 0;
26687bf8 305
ac2e41f5
VC
306 case STATE_OFFLINE:
307 f->header->state = STATE_ONLINE;
308 (void) fsync(f->fd);
309 return 0;
310
311 default:
312 return -EINVAL;
313 }
26687bf8
OS
314}
315
b58c888f
VC
316bool journal_file_is_offlining(JournalFile *f) {
317 assert(f);
318
319 __sync_synchronize();
320
321 if (f->offline_state == OFFLINE_DONE ||
322 f->offline_state == OFFLINE_JOINED)
323 return false;
324
325 return true;
326}
327
804ae586 328JournalFile* journal_file_close(JournalFile *f) {
de190aef 329 assert(f);
cec736d2 330
feb12d3e 331#ifdef HAVE_GCRYPT
b0af6f41 332 /* Write the final tag */
c586dbf1 333 if (f->seal && f->writable)
b0af6f41 334 journal_file_append_tag(f);
feb12d3e 335#endif
b0af6f41 336
7a24f3bf
VC
337 if (f->post_change_timer) {
338 int enabled;
339
340 if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0)
341 if (enabled == SD_EVENT_ONESHOT)
342 journal_file_post_change(f);
343
e167d7fd 344 (void) sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
7a24f3bf
VC
345 sd_event_source_unref(f->post_change_timer);
346 }
347
ac2e41f5 348 journal_file_set_offline(f, true);
cec736d2 349
fa6ac760
LP
350 if (f->mmap && f->fd >= 0)
351 mmap_cache_close_fd(f->mmap, f->fd);
cec736d2 352
11689d2a
LP
353 if (f->fd >= 0 && f->defrag_on_close) {
354
355 /* Be friendly to btrfs: turn COW back on again now,
356 * and defragment the file. We won't write to the file
357 * ever again, hence remove all fragmentation, and
358 * reenable all the good bits COW usually provides
359 * (such as data checksumming). */
360
1ed8f8c1 361 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
11689d2a
LP
362 (void) btrfs_defrag_fd(f->fd);
363 }
f27a3864 364
03e334a1 365 safe_close(f->fd);
cec736d2 366 free(f->path);
807e17f0 367
f649045c 368 mmap_cache_unref(f->mmap);
16e9f408 369
4743015d 370 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 371
d89c8fdf 372#if defined(HAVE_XZ) || defined(HAVE_LZ4)
807e17f0
LP
373 free(f->compress_buffer);
374#endif
375
7560fffc 376#ifdef HAVE_GCRYPT
baed47c3
LP
377 if (f->fss_file)
378 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
dc4ebc07 379 else
b7c9ae91
LP
380 free(f->fsprg_state);
381
382 free(f->fsprg_seed);
7560fffc
LP
383
384 if (f->hmac)
385 gcry_md_close(f->hmac);
386#endif
387
cec736d2 388 free(f);
804ae586 389 return NULL;
cec736d2
LP
390}
391
b58c888f
VC
392void journal_file_close_set(Set *s) {
393 JournalFile *f;
394
395 assert(s);
396
397 while ((f = set_steal_first(s)))
398 (void) journal_file_close(f);
399}
400
0ac38b70 401static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 402 Header h = {};
cec736d2
LP
403 ssize_t k;
404 int r;
405
406 assert(f);
407
7560fffc 408 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 409 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 410
d89c8fdf
ZJS
411 h.incompatible_flags |= htole32(
412 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
413 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 414
d89c8fdf
ZJS
415 h.compatible_flags = htole32(
416 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 417
cec736d2
LP
418 r = sd_id128_randomize(&h.file_id);
419 if (r < 0)
420 return r;
421
0ac38b70
LP
422 if (template) {
423 h.seqnum_id = template->header->seqnum_id;
beec0085 424 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
425 } else
426 h.seqnum_id = h.file_id;
cec736d2
LP
427
428 k = pwrite(f->fd, &h, sizeof(h), 0);
429 if (k < 0)
430 return -errno;
431
432 if (k != sizeof(h))
433 return -EIO;
434
435 return 0;
436}
437
438static int journal_file_refresh_header(JournalFile *f) {
de190aef 439 sd_id128_t boot_id;
fa6ac760 440 int r;
cec736d2
LP
441
442 assert(f);
c88cc6af 443 assert(f->header);
cec736d2
LP
444
445 r = sd_id128_get_machine(&f->header->machine_id);
446 if (r < 0)
447 return r;
448
de190aef 449 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
450 if (r < 0)
451 return r;
452
de190aef
LP
453 if (sd_id128_equal(boot_id, f->header->boot_id))
454 f->tail_entry_monotonic_valid = true;
455
456 f->header->boot_id = boot_id;
457
fa6ac760 458 r = journal_file_set_online(f);
b788cc23 459
7560fffc 460 /* Sync the online state to disk */
fb426037 461 (void) fsync(f->fd);
b788cc23 462
fa6ac760 463 return r;
cec736d2
LP
464}
465
466static int journal_file_verify_header(JournalFile *f) {
d89c8fdf
ZJS
467 uint32_t flags;
468
cec736d2 469 assert(f);
c88cc6af 470 assert(f->header);
cec736d2 471
7560fffc 472 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
473 return -EBADMSG;
474
7560fffc
LP
475 /* In both read and write mode we refuse to open files with
476 * incompatible flags we don't know */
d89c8fdf
ZJS
477 flags = le32toh(f->header->incompatible_flags);
478 if (flags & ~HEADER_INCOMPATIBLE_SUPPORTED) {
479 if (flags & ~HEADER_INCOMPATIBLE_ANY)
480 log_debug("Journal file %s has unknown incompatible flags %"PRIx32,
481 f->path, flags & ~HEADER_INCOMPATIBLE_ANY);
482 flags = (flags & HEADER_INCOMPATIBLE_ANY) & ~HEADER_INCOMPATIBLE_SUPPORTED;
483 if (flags)
484 log_debug("Journal file %s uses incompatible flags %"PRIx32
485 " disabled at compilation time.", f->path, flags);
cec736d2 486 return -EPROTONOSUPPORT;
d89c8fdf 487 }
cec736d2 488
7560fffc
LP
489 /* When open for writing we refuse to open files with
490 * compatible flags, too */
d89c8fdf
ZJS
491 flags = le32toh(f->header->compatible_flags);
492 if (f->writable && (flags & ~HEADER_COMPATIBLE_SUPPORTED)) {
493 if (flags & ~HEADER_COMPATIBLE_ANY)
494 log_debug("Journal file %s has unknown compatible flags %"PRIx32,
495 f->path, flags & ~HEADER_COMPATIBLE_ANY);
496 flags = (flags & HEADER_COMPATIBLE_ANY) & ~HEADER_COMPATIBLE_SUPPORTED;
497 if (flags)
498 log_debug("Journal file %s uses compatible flags %"PRIx32
499 " disabled at compilation time.", f->path, flags);
500 return -EPROTONOSUPPORT;
7560fffc
LP
501 }
502
db11ac1a
LP
503 if (f->header->state >= _STATE_MAX)
504 return -EBADMSG;
505
dca6219e
LP
506 /* The first addition was n_data, so check that we are at least this large */
507 if (le64toh(f->header->header_size) < HEADER_SIZE_MIN)
23b0b2b2
LP
508 return -EBADMSG;
509
8088cbd3 510 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
511 return -EBADMSG;
512
db11ac1a
LP
513 if ((le64toh(f->header->header_size) + le64toh(f->header->arena_size)) > (uint64_t) f->last_stat.st_size)
514 return -ENODATA;
515
516 if (le64toh(f->header->tail_object_offset) > (le64toh(f->header->header_size) + le64toh(f->header->arena_size)))
517 return -ENODATA;
518
7762e02b
LP
519 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
520 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
521 !VALID64(le64toh(f->header->tail_object_offset)) ||
522 !VALID64(le64toh(f->header->entry_array_offset)))
523 return -ENODATA;
524
cec736d2 525 if (f->writable) {
ccdbaf91 526 uint8_t state;
cec736d2
LP
527 sd_id128_t machine_id;
528 int r;
529
530 r = sd_id128_get_machine(&machine_id);
531 if (r < 0)
532 return r;
533
534 if (!sd_id128_equal(machine_id, f->header->machine_id))
535 return -EHOSTDOWN;
536
de190aef 537 state = f->header->state;
cec736d2 538
71fa6f00
LP
539 if (state == STATE_ONLINE) {
540 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
541 return -EBUSY;
542 } else if (state == STATE_ARCHIVED)
cec736d2 543 return -ESHUTDOWN;
71fa6f00 544 else if (state != STATE_OFFLINE) {
8facc349 545 log_debug("Journal file %s has unknown state %i.", f->path, state);
71fa6f00
LP
546 return -EBUSY;
547 }
cec736d2
LP
548 }
549
d89c8fdf
ZJS
550 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
551 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 552
f1889c91 553 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 554
cec736d2
LP
555 return 0;
556}
557
2678031a
LP
558static int journal_file_fstat(JournalFile *f) {
559 assert(f);
560 assert(f->fd >= 0);
561
562 if (fstat(f->fd, &f->last_stat) < 0)
563 return -errno;
564
565 f->last_stat_usec = now(CLOCK_MONOTONIC);
566
567 /* Refuse appending to files that are already deleted */
568 if (f->last_stat.st_nlink <= 0)
569 return -EIDRM;
570
571 return 0;
572}
573
cec736d2 574static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 575 uint64_t old_size, new_size;
fec2aa2f 576 int r;
cec736d2
LP
577
578 assert(f);
c88cc6af 579 assert(f->header);
cec736d2 580
cec736d2 581 /* We assume that this file is not sparse, and we know that
38ac38b2 582 * for sure, since we always call posix_fallocate()
cec736d2
LP
583 * ourselves */
584
fa6ac760
LP
585 if (mmap_cache_got_sigbus(f->mmap, f->fd))
586 return -EIO;
587
cec736d2 588 old_size =
23b0b2b2 589 le64toh(f->header->header_size) +
cec736d2
LP
590 le64toh(f->header->arena_size);
591
bc85bfee 592 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
593 if (new_size < le64toh(f->header->header_size))
594 new_size = le64toh(f->header->header_size);
bc85bfee 595
2678031a
LP
596 if (new_size <= old_size) {
597
598 /* We already pre-allocated enough space, but before
599 * we write to it, let's check with fstat() if the
600 * file got deleted, in order make sure we don't throw
601 * away the data immediately. Don't check fstat() for
602 * all writes though, but only once ever 10s. */
603
604 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
605 return 0;
606
607 return journal_file_fstat(f);
608 }
609
610 /* Allocate more space. */
cec736d2 611
a676e665 612 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 613 return -E2BIG;
cec736d2 614
a676e665 615 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
616 struct statvfs svfs;
617
618 if (fstatvfs(f->fd, &svfs) >= 0) {
619 uint64_t available;
620
070052ab 621 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
cec736d2
LP
622
623 if (new_size - old_size > available)
624 return -E2BIG;
625 }
626 }
627
eda4b58b
LP
628 /* Increase by larger blocks at once */
629 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
630 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
631 new_size = f->metrics.max_size;
632
bc85bfee
LP
633 /* Note that the glibc fallocate() fallback is very
634 inefficient, hence we try to minimize the allocation area
635 as we can. */
fec2aa2f
GV
636 r = posix_fallocate(f->fd, old_size, new_size - old_size);
637 if (r != 0)
638 return -r;
cec736d2 639
23b0b2b2 640 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 641
2678031a 642 return journal_file_fstat(f);
cec736d2
LP
643}
644
78519831 645static unsigned type_to_context(ObjectType type) {
d3d3208f 646 /* One context for each type, plus one catch-all for the rest */
69adae51 647 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 648 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 649 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
650}
651
7a9dabea 652static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret) {
2678031a
LP
653 int r;
654
cec736d2 655 assert(f);
cec736d2
LP
656 assert(ret);
657
7762e02b
LP
658 if (size <= 0)
659 return -EINVAL;
660
2a59ea54 661 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
662 if (offset + size > (uint64_t) f->last_stat.st_size) {
663 /* Hmm, out of range? Let's refresh the fstat() data
664 * first, before we trust that check. */
665
2678031a
LP
666 r = journal_file_fstat(f);
667 if (r < 0)
668 return r;
669
670 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
671 return -EADDRNOTAVAIL;
672 }
673
7a9dabea 674 return mmap_cache_get(f->mmap, f->fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
cec736d2
LP
675}
676
16e9f408
LP
677static uint64_t minimum_header_size(Object *o) {
678
b8e891e6 679 static const uint64_t table[] = {
16e9f408
LP
680 [OBJECT_DATA] = sizeof(DataObject),
681 [OBJECT_FIELD] = sizeof(FieldObject),
682 [OBJECT_ENTRY] = sizeof(EntryObject),
683 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
684 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
685 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
686 [OBJECT_TAG] = sizeof(TagObject),
687 };
688
689 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
690 return sizeof(ObjectHeader);
691
692 return table[o->object.type];
693}
694
78519831 695int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
696 int r;
697 void *t;
698 Object *o;
699 uint64_t s;
700
701 assert(f);
702 assert(ret);
703
db11ac1a
LP
704 /* Objects may only be located at multiple of 64 bit */
705 if (!VALID64(offset))
706 return -EFAULT;
707
7a9dabea 708 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
cec736d2
LP
709 if (r < 0)
710 return r;
711
712 o = (Object*) t;
713 s = le64toh(o->object.size);
714
715 if (s < sizeof(ObjectHeader))
716 return -EBADMSG;
717
16e9f408
LP
718 if (o->object.type <= OBJECT_UNUSED)
719 return -EBADMSG;
720
721 if (s < minimum_header_size(o))
722 return -EBADMSG;
723
d05089d8 724 if (type > OBJECT_UNUSED && o->object.type != type)
cec736d2
LP
725 return -EBADMSG;
726
727 if (s > sizeof(ObjectHeader)) {
7a9dabea 728 r = journal_file_move_to(f, type, false, offset, s, &t);
cec736d2
LP
729 if (r < 0)
730 return r;
731
732 o = (Object*) t;
733 }
734
cec736d2
LP
735 *ret = o;
736 return 0;
737}
738
d98cc1f2 739static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
740 uint64_t r;
741
742 assert(f);
c88cc6af 743 assert(f->header);
cec736d2 744
beec0085 745 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
746
747 if (seqnum) {
de190aef 748 /* If an external seqnum counter was passed, we update
c2373f84
LP
749 * both the local and the external one, and set it to
750 * the maximum of both */
751
752 if (*seqnum + 1 > r)
753 r = *seqnum + 1;
754
755 *seqnum = r;
756 }
757
beec0085 758 f->header->tail_entry_seqnum = htole64(r);
cec736d2 759
beec0085
LP
760 if (f->header->head_entry_seqnum == 0)
761 f->header->head_entry_seqnum = htole64(r);
de190aef 762
cec736d2
LP
763 return r;
764}
765
78519831 766int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
767 int r;
768 uint64_t p;
769 Object *tail, *o;
770 void *t;
771
772 assert(f);
c88cc6af 773 assert(f->header);
d05089d8 774 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
775 assert(size >= sizeof(ObjectHeader));
776 assert(offset);
777 assert(ret);
778
26687bf8
OS
779 r = journal_file_set_online(f);
780 if (r < 0)
781 return r;
782
cec736d2 783 p = le64toh(f->header->tail_object_offset);
cec736d2 784 if (p == 0)
23b0b2b2 785 p = le64toh(f->header->header_size);
cec736d2 786 else {
d05089d8 787 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
788 if (r < 0)
789 return r;
790
791 p += ALIGN64(le64toh(tail->object.size));
792 }
793
794 r = journal_file_allocate(f, p, size);
795 if (r < 0)
796 return r;
797
fcde2389 798 r = journal_file_move_to(f, type, false, p, size, &t);
cec736d2
LP
799 if (r < 0)
800 return r;
801
802 o = (Object*) t;
803
804 zero(o->object);
de190aef 805 o->object.type = type;
cec736d2
LP
806 o->object.size = htole64(size);
807
808 f->header->tail_object_offset = htole64(p);
cec736d2
LP
809 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
810
811 *ret = o;
812 *offset = p;
813
814 return 0;
815}
816
de190aef 817static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
818 uint64_t s, p;
819 Object *o;
820 int r;
821
822 assert(f);
c88cc6af 823 assert(f->header);
cec736d2 824
070052ab
LP
825 /* We estimate that we need 1 hash table entry per 768 bytes
826 of journal file and we want to make sure we never get
827 beyond 75% fill level. Calculate the hash table size for
828 the maximum file size based on these metrics. */
4a92baf3 829
dfabe643 830 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
831 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
832 s = DEFAULT_DATA_HASH_TABLE_SIZE;
833
507f22bd 834 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 835
de190aef
LP
836 r = journal_file_append_object(f,
837 OBJECT_DATA_HASH_TABLE,
838 offsetof(Object, hash_table.items) + s,
839 &o, &p);
cec736d2
LP
840 if (r < 0)
841 return r;
842
29804cc1 843 memzero(o->hash_table.items, s);
cec736d2 844
de190aef
LP
845 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
846 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
847
848 return 0;
849}
850
de190aef 851static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
852 uint64_t s, p;
853 Object *o;
854 int r;
855
856 assert(f);
c88cc6af 857 assert(f->header);
cec736d2 858
3c1668da
LP
859 /* We use a fixed size hash table for the fields as this
860 * number should grow very slowly only */
861
de190aef
LP
862 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
863 r = journal_file_append_object(f,
864 OBJECT_FIELD_HASH_TABLE,
865 offsetof(Object, hash_table.items) + s,
866 &o, &p);
cec736d2
LP
867 if (r < 0)
868 return r;
869
29804cc1 870 memzero(o->hash_table.items, s);
cec736d2 871
de190aef
LP
872 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
873 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
874
875 return 0;
876}
877
dade37d4 878int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
879 uint64_t s, p;
880 void *t;
881 int r;
882
883 assert(f);
c88cc6af 884 assert(f->header);
cec736d2 885
dade37d4
LP
886 if (f->data_hash_table)
887 return 0;
888
de190aef
LP
889 p = le64toh(f->header->data_hash_table_offset);
890 s = le64toh(f->header->data_hash_table_size);
cec736d2 891
de190aef 892 r = journal_file_move_to(f,
16e9f408 893 OBJECT_DATA_HASH_TABLE,
fcde2389 894 true,
de190aef
LP
895 p, s,
896 &t);
cec736d2
LP
897 if (r < 0)
898 return r;
899
de190aef 900 f->data_hash_table = t;
cec736d2
LP
901 return 0;
902}
903
dade37d4 904int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
905 uint64_t s, p;
906 void *t;
907 int r;
908
909 assert(f);
c88cc6af 910 assert(f->header);
cec736d2 911
dade37d4
LP
912 if (f->field_hash_table)
913 return 0;
914
de190aef
LP
915 p = le64toh(f->header->field_hash_table_offset);
916 s = le64toh(f->header->field_hash_table_size);
cec736d2 917
de190aef 918 r = journal_file_move_to(f,
16e9f408 919 OBJECT_FIELD_HASH_TABLE,
fcde2389 920 true,
de190aef
LP
921 p, s,
922 &t);
cec736d2
LP
923 if (r < 0)
924 return r;
925
de190aef 926 f->field_hash_table = t;
cec736d2
LP
927 return 0;
928}
929
3c1668da
LP
930static int journal_file_link_field(
931 JournalFile *f,
932 Object *o,
933 uint64_t offset,
934 uint64_t hash) {
935
805d1486 936 uint64_t p, h, m;
3c1668da
LP
937 int r;
938
939 assert(f);
c88cc6af 940 assert(f->header);
90d222c1 941 assert(f->field_hash_table);
3c1668da
LP
942 assert(o);
943 assert(offset > 0);
944
945 if (o->object.type != OBJECT_FIELD)
946 return -EINVAL;
947
805d1486
LP
948 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
949 if (m <= 0)
950 return -EBADMSG;
3c1668da 951
805d1486 952 /* This might alter the window we are looking at */
3c1668da
LP
953 o->field.next_hash_offset = o->field.head_data_offset = 0;
954
805d1486 955 h = hash % m;
3c1668da
LP
956 p = le64toh(f->field_hash_table[h].tail_hash_offset);
957 if (p == 0)
958 f->field_hash_table[h].head_hash_offset = htole64(offset);
959 else {
960 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
961 if (r < 0)
962 return r;
963
964 o->field.next_hash_offset = htole64(offset);
965 }
966
967 f->field_hash_table[h].tail_hash_offset = htole64(offset);
968
969 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
970 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
971
972 return 0;
973}
974
975static int journal_file_link_data(
976 JournalFile *f,
977 Object *o,
978 uint64_t offset,
979 uint64_t hash) {
980
805d1486 981 uint64_t p, h, m;
cec736d2
LP
982 int r;
983
984 assert(f);
c88cc6af 985 assert(f->header);
90d222c1 986 assert(f->data_hash_table);
cec736d2
LP
987 assert(o);
988 assert(offset > 0);
b588975f
LP
989
990 if (o->object.type != OBJECT_DATA)
991 return -EINVAL;
cec736d2 992
805d1486
LP
993 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
994 if (m <= 0)
995 return -EBADMSG;
48496df6 996
805d1486 997 /* This might alter the window we are looking at */
de190aef
LP
998 o->data.next_hash_offset = o->data.next_field_offset = 0;
999 o->data.entry_offset = o->data.entry_array_offset = 0;
1000 o->data.n_entries = 0;
cec736d2 1001
805d1486 1002 h = hash % m;
8db4213e 1003 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 1004 if (p == 0)
cec736d2 1005 /* Only entry in the hash table is easy */
de190aef 1006 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 1007 else {
48496df6
LP
1008 /* Move back to the previous data object, to patch in
1009 * pointer */
cec736d2 1010
de190aef 1011 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1012 if (r < 0)
1013 return r;
1014
de190aef 1015 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
1016 }
1017
de190aef 1018 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 1019
dca6219e
LP
1020 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1021 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
1022
cec736d2
LP
1023 return 0;
1024}
1025
3c1668da
LP
1026int journal_file_find_field_object_with_hash(
1027 JournalFile *f,
1028 const void *field, uint64_t size, uint64_t hash,
1029 Object **ret, uint64_t *offset) {
1030
805d1486 1031 uint64_t p, osize, h, m;
3c1668da
LP
1032 int r;
1033
1034 assert(f);
c88cc6af 1035 assert(f->header);
3c1668da
LP
1036 assert(field && size > 0);
1037
dade37d4
LP
1038 /* If the field hash table is empty, we can't find anything */
1039 if (le64toh(f->header->field_hash_table_size) <= 0)
1040 return 0;
1041
1042 /* Map the field hash table, if it isn't mapped yet. */
1043 r = journal_file_map_field_hash_table(f);
1044 if (r < 0)
1045 return r;
1046
3c1668da
LP
1047 osize = offsetof(Object, field.payload) + size;
1048
805d1486 1049 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
805d1486 1050 if (m <= 0)
3c1668da
LP
1051 return -EBADMSG;
1052
805d1486 1053 h = hash % m;
3c1668da
LP
1054 p = le64toh(f->field_hash_table[h].head_hash_offset);
1055
1056 while (p > 0) {
1057 Object *o;
1058
1059 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1060 if (r < 0)
1061 return r;
1062
1063 if (le64toh(o->field.hash) == hash &&
1064 le64toh(o->object.size) == osize &&
1065 memcmp(o->field.payload, field, size) == 0) {
1066
1067 if (ret)
1068 *ret = o;
1069 if (offset)
1070 *offset = p;
1071
1072 return 1;
1073 }
1074
1075 p = le64toh(o->field.next_hash_offset);
1076 }
1077
1078 return 0;
1079}
1080
1081int journal_file_find_field_object(
1082 JournalFile *f,
1083 const void *field, uint64_t size,
1084 Object **ret, uint64_t *offset) {
1085
1086 uint64_t hash;
1087
1088 assert(f);
1089 assert(field && size > 0);
1090
1091 hash = hash64(field, size);
1092
1093 return journal_file_find_field_object_with_hash(f,
1094 field, size, hash,
1095 ret, offset);
1096}
1097
de190aef
LP
1098int journal_file_find_data_object_with_hash(
1099 JournalFile *f,
1100 const void *data, uint64_t size, uint64_t hash,
1101 Object **ret, uint64_t *offset) {
48496df6 1102
805d1486 1103 uint64_t p, osize, h, m;
cec736d2
LP
1104 int r;
1105
1106 assert(f);
c88cc6af 1107 assert(f->header);
cec736d2
LP
1108 assert(data || size == 0);
1109
dade37d4
LP
1110 /* If there's no data hash table, then there's no entry. */
1111 if (le64toh(f->header->data_hash_table_size) <= 0)
1112 return 0;
1113
1114 /* Map the data hash table, if it isn't mapped yet. */
1115 r = journal_file_map_data_hash_table(f);
1116 if (r < 0)
1117 return r;
1118
cec736d2
LP
1119 osize = offsetof(Object, data.payload) + size;
1120
805d1486
LP
1121 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1122 if (m <= 0)
bc85bfee
LP
1123 return -EBADMSG;
1124
805d1486 1125 h = hash % m;
de190aef 1126 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 1127
de190aef
LP
1128 while (p > 0) {
1129 Object *o;
cec736d2 1130
de190aef 1131 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1132 if (r < 0)
1133 return r;
1134
807e17f0 1135 if (le64toh(o->data.hash) != hash)
85a131e8 1136 goto next;
807e17f0 1137
d89c8fdf 1138 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 1139#if defined(HAVE_XZ) || defined(HAVE_LZ4)
fa1c4b51 1140 uint64_t l;
a7f7d1bd 1141 size_t rsize = 0;
cec736d2 1142
807e17f0
LP
1143 l = le64toh(o->object.size);
1144 if (l <= offsetof(Object, data.payload))
cec736d2
LP
1145 return -EBADMSG;
1146
807e17f0
LP
1147 l -= offsetof(Object, data.payload);
1148
d89c8fdf
ZJS
1149 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
1150 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
1151 if (r < 0)
1152 return r;
807e17f0 1153
b785c858 1154 if (rsize == size &&
807e17f0
LP
1155 memcmp(f->compress_buffer, data, size) == 0) {
1156
1157 if (ret)
1158 *ret = o;
1159
1160 if (offset)
1161 *offset = p;
1162
1163 return 1;
1164 }
3b1a55e1
ZJS
1165#else
1166 return -EPROTONOSUPPORT;
1167#endif
807e17f0
LP
1168 } else if (le64toh(o->object.size) == osize &&
1169 memcmp(o->data.payload, data, size) == 0) {
1170
cec736d2
LP
1171 if (ret)
1172 *ret = o;
1173
1174 if (offset)
1175 *offset = p;
1176
de190aef 1177 return 1;
cec736d2
LP
1178 }
1179
85a131e8 1180 next:
cec736d2
LP
1181 p = le64toh(o->data.next_hash_offset);
1182 }
1183
de190aef
LP
1184 return 0;
1185}
1186
1187int journal_file_find_data_object(
1188 JournalFile *f,
1189 const void *data, uint64_t size,
1190 Object **ret, uint64_t *offset) {
1191
1192 uint64_t hash;
1193
1194 assert(f);
1195 assert(data || size == 0);
1196
1197 hash = hash64(data, size);
1198
1199 return journal_file_find_data_object_with_hash(f,
1200 data, size, hash,
1201 ret, offset);
1202}
1203
3c1668da
LP
1204static int journal_file_append_field(
1205 JournalFile *f,
1206 const void *field, uint64_t size,
1207 Object **ret, uint64_t *offset) {
1208
1209 uint64_t hash, p;
1210 uint64_t osize;
1211 Object *o;
1212 int r;
1213
1214 assert(f);
1215 assert(field && size > 0);
1216
1217 hash = hash64(field, size);
1218
1219 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1220 if (r < 0)
1221 return r;
1222 else if (r > 0) {
1223
1224 if (ret)
1225 *ret = o;
1226
1227 if (offset)
1228 *offset = p;
1229
1230 return 0;
1231 }
1232
1233 osize = offsetof(Object, field.payload) + size;
1234 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
1235 if (r < 0)
1236 return r;
3c1668da
LP
1237
1238 o->field.hash = htole64(hash);
1239 memcpy(o->field.payload, field, size);
1240
1241 r = journal_file_link_field(f, o, p, hash);
1242 if (r < 0)
1243 return r;
1244
1245 /* The linking might have altered the window, so let's
1246 * refresh our pointer */
1247 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1248 if (r < 0)
1249 return r;
1250
1251#ifdef HAVE_GCRYPT
1252 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1253 if (r < 0)
1254 return r;
1255#endif
1256
1257 if (ret)
1258 *ret = o;
1259
1260 if (offset)
1261 *offset = p;
1262
1263 return 0;
1264}
1265
48496df6
LP
1266static int journal_file_append_data(
1267 JournalFile *f,
1268 const void *data, uint64_t size,
1269 Object **ret, uint64_t *offset) {
1270
de190aef
LP
1271 uint64_t hash, p;
1272 uint64_t osize;
1273 Object *o;
d89c8fdf 1274 int r, compression = 0;
3c1668da 1275 const void *eq;
de190aef
LP
1276
1277 assert(f);
1278 assert(data || size == 0);
1279
1280 hash = hash64(data, size);
1281
1282 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1283 if (r < 0)
1284 return r;
0240c603 1285 if (r > 0) {
de190aef
LP
1286
1287 if (ret)
1288 *ret = o;
1289
1290 if (offset)
1291 *offset = p;
1292
1293 return 0;
1294 }
1295
1296 osize = offsetof(Object, data.payload) + size;
1297 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1298 if (r < 0)
1299 return r;
1300
cec736d2 1301 o->data.hash = htole64(hash);
807e17f0 1302
d89c8fdf 1303#if defined(HAVE_XZ) || defined(HAVE_LZ4)
d1afbcd2 1304 if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
a7f7d1bd 1305 size_t rsize = 0;
807e17f0 1306
5d6f46b6 1307 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
807e17f0 1308
d1afbcd2 1309 if (compression >= 0) {
807e17f0 1310 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1311 o->object.flags |= compression;
807e17f0 1312
fa1c4b51 1313 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1314 size, rsize, object_compressed_to_string(compression));
d1afbcd2
LP
1315 } else
1316 /* Compression didn't work, we don't really care why, let's continue without compression */
1317 compression = 0;
807e17f0
LP
1318 }
1319#endif
1320
75f32f04
ZJS
1321 if (compression == 0)
1322 memcpy_safe(o->data.payload, data, size);
cec736d2 1323
de190aef 1324 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1325 if (r < 0)
1326 return r;
1327
48496df6
LP
1328 /* The linking might have altered the window, so let's
1329 * refresh our pointer */
1330 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1331 if (r < 0)
1332 return r;
1333
08c6f819
SL
1334 if (!data)
1335 eq = NULL;
1336 else
1337 eq = memchr(data, '=', size);
3c1668da 1338 if (eq && eq > data) {
748db592 1339 Object *fo = NULL;
3c1668da 1340 uint64_t fp;
3c1668da
LP
1341
1342 /* Create field object ... */
1343 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1344 if (r < 0)
1345 return r;
1346
1347 /* ... and link it in. */
1348 o->data.next_field_offset = fo->field.head_data_offset;
1349 fo->field.head_data_offset = le64toh(p);
1350 }
1351
5996c7c2
LP
1352#ifdef HAVE_GCRYPT
1353 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1354 if (r < 0)
1355 return r;
1356#endif
1357
cec736d2
LP
1358 if (ret)
1359 *ret = o;
1360
1361 if (offset)
de190aef 1362 *offset = p;
cec736d2
LP
1363
1364 return 0;
1365}
1366
1367uint64_t journal_file_entry_n_items(Object *o) {
1368 assert(o);
b588975f
LP
1369
1370 if (o->object.type != OBJECT_ENTRY)
1371 return 0;
cec736d2
LP
1372
1373 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1374}
1375
0284adc6 1376uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1377 assert(o);
b588975f
LP
1378
1379 if (o->object.type != OBJECT_ENTRY_ARRAY)
1380 return 0;
de190aef
LP
1381
1382 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1383}
1384
fb9a24b6
LP
1385uint64_t journal_file_hash_table_n_items(Object *o) {
1386 assert(o);
b588975f
LP
1387
1388 if (o->object.type != OBJECT_DATA_HASH_TABLE &&
1389 o->object.type != OBJECT_FIELD_HASH_TABLE)
1390 return 0;
fb9a24b6
LP
1391
1392 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1393}
1394
de190aef 1395static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1396 le64_t *first,
1397 le64_t *idx,
de190aef 1398 uint64_t p) {
cec736d2 1399 int r;
de190aef
LP
1400 uint64_t n = 0, ap = 0, q, i, a, hidx;
1401 Object *o;
1402
cec736d2 1403 assert(f);
c88cc6af 1404 assert(f->header);
de190aef
LP
1405 assert(first);
1406 assert(idx);
1407 assert(p > 0);
cec736d2 1408
de190aef
LP
1409 a = le64toh(*first);
1410 i = hidx = le64toh(*idx);
1411 while (a > 0) {
1412
1413 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1414 if (r < 0)
1415 return r;
cec736d2 1416
de190aef
LP
1417 n = journal_file_entry_array_n_items(o);
1418 if (i < n) {
1419 o->entry_array.items[i] = htole64(p);
1420 *idx = htole64(hidx + 1);
1421 return 0;
1422 }
cec736d2 1423
de190aef
LP
1424 i -= n;
1425 ap = a;
1426 a = le64toh(o->entry_array.next_entry_array_offset);
1427 }
1428
1429 if (hidx > n)
1430 n = (hidx+1) * 2;
1431 else
1432 n = n * 2;
1433
1434 if (n < 4)
1435 n = 4;
1436
1437 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1438 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1439 &o, &q);
cec736d2
LP
1440 if (r < 0)
1441 return r;
1442
feb12d3e 1443#ifdef HAVE_GCRYPT
5996c7c2 1444 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1445 if (r < 0)
1446 return r;
feb12d3e 1447#endif
b0af6f41 1448
de190aef 1449 o->entry_array.items[i] = htole64(p);
cec736d2 1450
de190aef 1451 if (ap == 0)
7be3aa17 1452 *first = htole64(q);
cec736d2 1453 else {
de190aef 1454 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1455 if (r < 0)
1456 return r;
1457
de190aef
LP
1458 o->entry_array.next_entry_array_offset = htole64(q);
1459 }
cec736d2 1460
2dee23eb
LP
1461 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1462 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1463
de190aef
LP
1464 *idx = htole64(hidx + 1);
1465
1466 return 0;
1467}
cec736d2 1468
de190aef 1469static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1470 le64_t *extra,
1471 le64_t *first,
1472 le64_t *idx,
de190aef
LP
1473 uint64_t p) {
1474
1475 int r;
1476
1477 assert(f);
1478 assert(extra);
1479 assert(first);
1480 assert(idx);
1481 assert(p > 0);
1482
1483 if (*idx == 0)
1484 *extra = htole64(p);
1485 else {
4fd052ae 1486 le64_t i;
de190aef 1487
7be3aa17 1488 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1489 r = link_entry_into_array(f, first, &i, p);
1490 if (r < 0)
1491 return r;
cec736d2
LP
1492 }
1493
de190aef
LP
1494 *idx = htole64(le64toh(*idx) + 1);
1495 return 0;
1496}
1497
1498static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1499 uint64_t p;
1500 int r;
1501 assert(f);
1502 assert(o);
1503 assert(offset > 0);
1504
1505 p = le64toh(o->entry.items[i].object_offset);
1506 if (p == 0)
1507 return -EINVAL;
1508
1509 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1510 if (r < 0)
1511 return r;
1512
de190aef
LP
1513 return link_entry_into_array_plus_one(f,
1514 &o->data.entry_offset,
1515 &o->data.entry_array_offset,
1516 &o->data.n_entries,
1517 offset);
cec736d2
LP
1518}
1519
1520static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1521 uint64_t n, i;
cec736d2
LP
1522 int r;
1523
1524 assert(f);
c88cc6af 1525 assert(f->header);
cec736d2
LP
1526 assert(o);
1527 assert(offset > 0);
b588975f
LP
1528
1529 if (o->object.type != OBJECT_ENTRY)
1530 return -EINVAL;
cec736d2 1531
b788cc23
LP
1532 __sync_synchronize();
1533
cec736d2 1534 /* Link up the entry itself */
de190aef
LP
1535 r = link_entry_into_array(f,
1536 &f->header->entry_array_offset,
1537 &f->header->n_entries,
1538 offset);
1539 if (r < 0)
1540 return r;
cec736d2 1541
507f22bd 1542 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1543
de190aef 1544 if (f->header->head_entry_realtime == 0)
0ac38b70 1545 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1546
0ac38b70 1547 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1548 f->header->tail_entry_monotonic = o->entry.monotonic;
1549
1550 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1551
1552 /* Link up the items */
1553 n = journal_file_entry_n_items(o);
1554 for (i = 0; i < n; i++) {
1555 r = journal_file_link_entry_item(f, o, offset, i);
1556 if (r < 0)
1557 return r;
1558 }
1559
cec736d2
LP
1560 return 0;
1561}
1562
1563static int journal_file_append_entry_internal(
1564 JournalFile *f,
1565 const dual_timestamp *ts,
1566 uint64_t xor_hash,
1567 const EntryItem items[], unsigned n_items,
de190aef 1568 uint64_t *seqnum,
cec736d2
LP
1569 Object **ret, uint64_t *offset) {
1570 uint64_t np;
1571 uint64_t osize;
1572 Object *o;
1573 int r;
1574
1575 assert(f);
c88cc6af 1576 assert(f->header);
cec736d2 1577 assert(items || n_items == 0);
de190aef 1578 assert(ts);
cec736d2
LP
1579
1580 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1581
de190aef 1582 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1583 if (r < 0)
1584 return r;
1585
d98cc1f2 1586 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
75f32f04 1587 memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1588 o->entry.realtime = htole64(ts->realtime);
1589 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1590 o->entry.xor_hash = htole64(xor_hash);
1591 o->entry.boot_id = f->header->boot_id;
1592
feb12d3e 1593#ifdef HAVE_GCRYPT
5996c7c2 1594 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1595 if (r < 0)
1596 return r;
feb12d3e 1597#endif
b0af6f41 1598
cec736d2
LP
1599 r = journal_file_link_entry(f, o, np);
1600 if (r < 0)
1601 return r;
1602
1603 if (ret)
1604 *ret = o;
1605
1606 if (offset)
1607 *offset = np;
1608
1609 return 0;
1610}
1611
cf244689 1612void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1613 assert(f);
1614
1615 /* inotify() does not receive IN_MODIFY events from file
1616 * accesses done via mmap(). After each access we hence
1617 * trigger IN_MODIFY by truncating the journal file to its
1618 * current size which triggers IN_MODIFY. */
1619
bc85bfee
LP
1620 __sync_synchronize();
1621
50f20cfd 1622 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
e167d7fd 1623 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1624}
1625
7a24f3bf
VC
1626static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1627 assert(userdata);
1628
1629 journal_file_post_change(userdata);
1630
1631 return 1;
1632}
1633
1634static void schedule_post_change(JournalFile *f) {
1635 sd_event_source *timer;
1636 int enabled, r;
1637 uint64_t now;
1638
1639 assert(f);
1640 assert(f->post_change_timer);
1641
1642 timer = f->post_change_timer;
1643
1644 r = sd_event_source_get_enabled(timer, &enabled);
1645 if (r < 0) {
e167d7fd
LP
1646 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1647 goto fail;
7a24f3bf
VC
1648 }
1649
1650 if (enabled == SD_EVENT_ONESHOT)
1651 return;
1652
1653 r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now);
1654 if (r < 0) {
e167d7fd
LP
1655 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1656 goto fail;
7a24f3bf
VC
1657 }
1658
1659 r = sd_event_source_set_time(timer, now+f->post_change_timer_period);
1660 if (r < 0) {
e167d7fd
LP
1661 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1662 goto fail;
7a24f3bf
VC
1663 }
1664
1665 r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
1666 if (r < 0) {
e167d7fd
LP
1667 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1668 goto fail;
7a24f3bf 1669 }
e167d7fd
LP
1670
1671 return;
1672
1673fail:
1674 /* On failure, let's simply post the change immediately. */
1675 journal_file_post_change(f);
7a24f3bf
VC
1676}
1677
1678/* Enable coalesced change posting in a timer on the provided sd_event instance */
1679int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1680 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1681 int r;
1682
1683 assert(f);
1684 assert_return(!f->post_change_timer, -EINVAL);
1685 assert(e);
1686 assert(t);
1687
1688 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1689 if (r < 0)
1690 return r;
1691
1692 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1693 if (r < 0)
1694 return r;
1695
1696 f->post_change_timer = timer;
1697 timer = NULL;
1698 f->post_change_timer_period = t;
1699
1700 return r;
1701}
1702
1f2da9ec
LP
1703static int entry_item_cmp(const void *_a, const void *_b) {
1704 const EntryItem *a = _a, *b = _b;
1705
1706 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1707 return -1;
1708 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1709 return 1;
1710 return 0;
1711}
1712
de190aef 1713int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1714 unsigned i;
1715 EntryItem *items;
1716 int r;
1717 uint64_t xor_hash = 0;
de190aef 1718 struct dual_timestamp _ts;
cec736d2
LP
1719
1720 assert(f);
c88cc6af 1721 assert(f->header);
cec736d2
LP
1722 assert(iovec || n_iovec == 0);
1723
de190aef
LP
1724 if (!ts) {
1725 dual_timestamp_get(&_ts);
1726 ts = &_ts;
1727 }
1728
feb12d3e 1729#ifdef HAVE_GCRYPT
7560fffc
LP
1730 r = journal_file_maybe_append_tag(f, ts->realtime);
1731 if (r < 0)
1732 return r;
feb12d3e 1733#endif
7560fffc 1734
64825d3c 1735 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1736 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1737
1738 for (i = 0; i < n_iovec; i++) {
1739 uint64_t p;
1740 Object *o;
1741
1742 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1743 if (r < 0)
cf244689 1744 return r;
cec736d2
LP
1745
1746 xor_hash ^= le64toh(o->data.hash);
1747 items[i].object_offset = htole64(p);
de7b95cd 1748 items[i].hash = o->data.hash;
cec736d2
LP
1749 }
1750
1f2da9ec
LP
1751 /* Order by the position on disk, in order to improve seek
1752 * times for rotating media. */
7ff7394d 1753 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 1754
de190aef 1755 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1756
fa6ac760
LP
1757 /* If the memory mapping triggered a SIGBUS then we return an
1758 * IO error and ignore the error code passed down to us, since
1759 * it is very likely just an effect of a nullified replacement
1760 * mapping page */
1761
1762 if (mmap_cache_got_sigbus(f->mmap, f->fd))
1763 r = -EIO;
1764
7a24f3bf
VC
1765 if (f->post_change_timer)
1766 schedule_post_change(f);
1767 else
1768 journal_file_post_change(f);
50f20cfd 1769
cec736d2
LP
1770 return r;
1771}
1772
/* One cached position inside a chain of entry arrays. */
typedef struct ChainCacheItem {
        /* the array at the beginning of the chain */
        uint64_t first;

        /* the cached array */
        uint64_t array;

        /* the first item in the cached array */
        uint64_t begin;

        /* the total number of items in all arrays before this one in the chain */
        uint64_t total;

        /* the last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
1780
1781static void chain_cache_put(
4743015d 1782 OrderedHashmap *h,
a4bcff5b
LP
1783 ChainCacheItem *ci,
1784 uint64_t first,
1785 uint64_t array,
1786 uint64_t begin,
f268980d
LP
1787 uint64_t total,
1788 uint64_t last_index) {
a4bcff5b
LP
1789
1790 if (!ci) {
34741aa3
LP
1791 /* If the chain item to cache for this chain is the
1792 * first one it's not worth caching anything */
1793 if (array == first)
1794 return;
1795
29433089 1796 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 1797 ci = ordered_hashmap_steal_first(h);
29433089
LP
1798 assert(ci);
1799 } else {
a4bcff5b
LP
1800 ci = new(ChainCacheItem, 1);
1801 if (!ci)
1802 return;
1803 }
1804
1805 ci->first = first;
1806
4743015d 1807 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
1808 free(ci);
1809 return;
1810 }
1811 } else
1812 assert(ci->first == first);
1813
1814 ci->array = array;
1815 ci->begin = begin;
1816 ci->total = total;
f268980d 1817 ci->last_index = last_index;
a4bcff5b
LP
1818}
1819
f268980d
LP
1820static int generic_array_get(
1821 JournalFile *f,
1822 uint64_t first,
1823 uint64_t i,
1824 Object **ret, uint64_t *offset) {
de190aef 1825
cec736d2 1826 Object *o;
a4bcff5b 1827 uint64_t p = 0, a, t = 0;
cec736d2 1828 int r;
a4bcff5b 1829 ChainCacheItem *ci;
cec736d2
LP
1830
1831 assert(f);
1832
de190aef 1833 a = first;
a4bcff5b
LP
1834
1835 /* Try the chain cache first */
4743015d 1836 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1837 if (ci && i > ci->total) {
1838 a = ci->array;
1839 i -= ci->total;
1840 t = ci->total;
1841 }
1842
de190aef 1843 while (a > 0) {
a4bcff5b 1844 uint64_t k;
cec736d2 1845
de190aef
LP
1846 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1847 if (r < 0)
1848 return r;
cec736d2 1849
a4bcff5b
LP
1850 k = journal_file_entry_array_n_items(o);
1851 if (i < k) {
de190aef 1852 p = le64toh(o->entry_array.items[i]);
a4bcff5b 1853 goto found;
cec736d2
LP
1854 }
1855
a4bcff5b
LP
1856 i -= k;
1857 t += k;
de190aef
LP
1858 a = le64toh(o->entry_array.next_entry_array_offset);
1859 }
1860
a4bcff5b
LP
1861 return 0;
1862
1863found:
1864 /* Let's cache this item for the next invocation */
af13a6b0 1865 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
1866
1867 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1868 if (r < 0)
1869 return r;
1870
1871 if (ret)
1872 *ret = o;
1873
1874 if (offset)
1875 *offset = p;
1876
1877 return 1;
1878}
1879
f268980d
LP
1880static int generic_array_get_plus_one(
1881 JournalFile *f,
1882 uint64_t extra,
1883 uint64_t first,
1884 uint64_t i,
1885 Object **ret, uint64_t *offset) {
de190aef
LP
1886
1887 Object *o;
1888
1889 assert(f);
1890
1891 if (i == 0) {
1892 int r;
1893
1894 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
1895 if (r < 0)
1896 return r;
1897
de190aef
LP
1898 if (ret)
1899 *ret = o;
cec736d2 1900
de190aef
LP
1901 if (offset)
1902 *offset = extra;
cec736d2 1903
de190aef 1904 return 1;
cec736d2
LP
1905 }
1906
de190aef
LP
1907 return generic_array_get(f, first, i-1, ret, offset);
1908}
cec736d2 1909
de190aef
LP
/* Results of the test_object callbacks used when bisecting entry arrays. */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object compares lower than the needle */
        TEST_RIGHT = 2  /* the tested object compares higher than the needle */
};
cec736d2 1915
f268980d
LP
1916static int generic_array_bisect(
1917 JournalFile *f,
1918 uint64_t first,
1919 uint64_t n,
1920 uint64_t needle,
1921 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1922 direction_t direction,
1923 Object **ret,
1924 uint64_t *offset,
1925 uint64_t *idx) {
1926
1927 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
1928 bool subtract_one = false;
1929 Object *o, *array = NULL;
1930 int r;
a4bcff5b 1931 ChainCacheItem *ci;
cec736d2 1932
de190aef
LP
1933 assert(f);
1934 assert(test_object);
cec736d2 1935
a4bcff5b 1936 /* Start with the first array in the chain */
de190aef 1937 a = first;
a4bcff5b 1938
4743015d 1939 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
1940 if (ci && n > ci->total) {
1941 /* Ah, we have iterated this bisection array chain
1942 * previously! Let's see if we can skip ahead in the
1943 * chain, as far as the last time. But we can't jump
1944 * backwards in the chain, so let's check that
1945 * first. */
1946
1947 r = test_object(f, ci->begin, needle);
1948 if (r < 0)
1949 return r;
1950
1951 if (r == TEST_LEFT) {
f268980d 1952 /* OK, what we are looking for is right of the
a4bcff5b
LP
1953 * begin of this EntryArray, so let's jump
1954 * straight to previously cached array in the
1955 * chain */
1956
1957 a = ci->array;
1958 n -= ci->total;
1959 t = ci->total;
f268980d 1960 last_index = ci->last_index;
a4bcff5b
LP
1961 }
1962 }
1963
de190aef
LP
1964 while (a > 0) {
1965 uint64_t left, right, k, lp;
1966
1967 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
1968 if (r < 0)
1969 return r;
1970
de190aef
LP
1971 k = journal_file_entry_array_n_items(array);
1972 right = MIN(k, n);
1973 if (right <= 0)
1974 return 0;
cec736d2 1975
de190aef
LP
1976 i = right - 1;
1977 lp = p = le64toh(array->entry_array.items[i]);
1978 if (p <= 0)
1979 return -EBADMSG;
cec736d2 1980
de190aef
LP
1981 r = test_object(f, p, needle);
1982 if (r < 0)
1983 return r;
cec736d2 1984
de190aef
LP
1985 if (r == TEST_FOUND)
1986 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1987
1988 if (r == TEST_RIGHT) {
1989 left = 0;
1990 right -= 1;
f268980d
LP
1991
1992 if (last_index != (uint64_t) -1) {
1993 assert(last_index <= right);
1994
1995 /* If we cached the last index we
1996 * looked at, let's try to not to jump
1997 * too wildly around and see if we can
1998 * limit the range to look at early to
1999 * the immediate neighbors of the last
2000 * index we looked at. */
2001
2002 if (last_index > 0) {
2003 uint64_t x = last_index - 1;
2004
2005 p = le64toh(array->entry_array.items[x]);
2006 if (p <= 0)
2007 return -EBADMSG;
2008
2009 r = test_object(f, p, needle);
2010 if (r < 0)
2011 return r;
2012
2013 if (r == TEST_FOUND)
2014 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2015
2016 if (r == TEST_RIGHT)
2017 right = x;
2018 else
2019 left = x + 1;
2020 }
2021
2022 if (last_index < right) {
2023 uint64_t y = last_index + 1;
2024
2025 p = le64toh(array->entry_array.items[y]);
2026 if (p <= 0)
2027 return -EBADMSG;
2028
2029 r = test_object(f, p, needle);
2030 if (r < 0)
2031 return r;
2032
2033 if (r == TEST_FOUND)
2034 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2035
2036 if (r == TEST_RIGHT)
2037 right = y;
2038 else
2039 left = y + 1;
2040 }
f268980d
LP
2041 }
2042
de190aef
LP
2043 for (;;) {
2044 if (left == right) {
2045 if (direction == DIRECTION_UP)
2046 subtract_one = true;
2047
2048 i = left;
2049 goto found;
2050 }
2051
2052 assert(left < right);
de190aef 2053 i = (left + right) / 2;
f268980d 2054
de190aef
LP
2055 p = le64toh(array->entry_array.items[i]);
2056 if (p <= 0)
2057 return -EBADMSG;
2058
2059 r = test_object(f, p, needle);
2060 if (r < 0)
2061 return r;
cec736d2 2062
de190aef
LP
2063 if (r == TEST_FOUND)
2064 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2065
2066 if (r == TEST_RIGHT)
2067 right = i;
2068 else
2069 left = i + 1;
2070 }
2071 }
2072
2173cbf8 2073 if (k >= n) {
cbdca852
LP
2074 if (direction == DIRECTION_UP) {
2075 i = n;
2076 subtract_one = true;
2077 goto found;
2078 }
2079
cec736d2 2080 return 0;
cbdca852 2081 }
cec736d2 2082
de190aef
LP
2083 last_p = lp;
2084
2085 n -= k;
2086 t += k;
f268980d 2087 last_index = (uint64_t) -1;
de190aef 2088 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
2089 }
2090
2091 return 0;
de190aef
LP
2092
2093found:
2094 if (subtract_one && t == 0 && i == 0)
2095 return 0;
2096
a4bcff5b 2097 /* Let's cache this item for the next invocation */
af13a6b0 2098 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 2099
de190aef
LP
2100 if (subtract_one && i == 0)
2101 p = last_p;
2102 else if (subtract_one)
2103 p = le64toh(array->entry_array.items[i-1]);
2104 else
2105 p = le64toh(array->entry_array.items[i]);
2106
2107 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2108 if (r < 0)
2109 return r;
2110
2111 if (ret)
2112 *ret = o;
2113
2114 if (offset)
2115 *offset = p;
2116
2117 if (idx)
cbdca852 2118 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
2119
2120 return 1;
cec736d2
LP
2121}
2122
f268980d
LP
2123static int generic_array_bisect_plus_one(
2124 JournalFile *f,
2125 uint64_t extra,
2126 uint64_t first,
2127 uint64_t n,
2128 uint64_t needle,
2129 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2130 direction_t direction,
2131 Object **ret,
2132 uint64_t *offset,
2133 uint64_t *idx) {
de190aef 2134
cec736d2 2135 int r;
cbdca852
LP
2136 bool step_back = false;
2137 Object *o;
cec736d2
LP
2138
2139 assert(f);
de190aef 2140 assert(test_object);
cec736d2 2141
de190aef
LP
2142 if (n <= 0)
2143 return 0;
cec736d2 2144
de190aef
LP
2145 /* This bisects the array in object 'first', but first checks
2146 * an extra */
de190aef
LP
2147 r = test_object(f, extra, needle);
2148 if (r < 0)
2149 return r;
a536e261
LP
2150
2151 if (r == TEST_FOUND)
2152 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2153
cbdca852
LP
2154 /* if we are looking with DIRECTION_UP then we need to first
2155 see if in the actual array there is a matching entry, and
2156 return the last one of that. But if there isn't any we need
2157 to return this one. Hence remember this, and return it
2158 below. */
2159 if (r == TEST_LEFT)
2160 step_back = direction == DIRECTION_UP;
de190aef 2161
cbdca852
LP
2162 if (r == TEST_RIGHT) {
2163 if (direction == DIRECTION_DOWN)
2164 goto found;
2165 else
2166 return 0;
a536e261 2167 }
cec736d2 2168
de190aef
LP
2169 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
2170
cbdca852
LP
2171 if (r == 0 && step_back)
2172 goto found;
2173
ecf68b1d 2174 if (r > 0 && idx)
313cefa1 2175 (*idx)++;
de190aef
LP
2176
2177 return r;
cbdca852
LP
2178
2179found:
2180 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2181 if (r < 0)
2182 return r;
2183
2184 if (ret)
2185 *ret = o;
2186
2187 if (offset)
2188 *offset = extra;
2189
2190 if (idx)
2191 *idx = 0;
2192
2193 return 1;
2194}
2195
44a6b1b6 2196_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
2197 assert(f);
2198 assert(p > 0);
2199
2200 if (p == needle)
2201 return TEST_FOUND;
2202 else if (p < needle)
2203 return TEST_LEFT;
2204 else
2205 return TEST_RIGHT;
2206}
2207
de190aef
LP
2208static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2209 Object *o;
2210 int r;
2211
2212 assert(f);
2213 assert(p > 0);
2214
2215 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
2216 if (r < 0)
2217 return r;
2218
de190aef
LP
2219 if (le64toh(o->entry.seqnum) == needle)
2220 return TEST_FOUND;
2221 else if (le64toh(o->entry.seqnum) < needle)
2222 return TEST_LEFT;
2223 else
2224 return TEST_RIGHT;
2225}
cec736d2 2226
de190aef
LP
2227int journal_file_move_to_entry_by_seqnum(
2228 JournalFile *f,
2229 uint64_t seqnum,
2230 direction_t direction,
2231 Object **ret,
2232 uint64_t *offset) {
c88cc6af
VC
2233 assert(f);
2234 assert(f->header);
de190aef
LP
2235
2236 return generic_array_bisect(f,
2237 le64toh(f->header->entry_array_offset),
2238 le64toh(f->header->n_entries),
2239 seqnum,
2240 test_object_seqnum,
2241 direction,
2242 ret, offset, NULL);
2243}
cec736d2 2244
de190aef
LP
2245static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2246 Object *o;
2247 int r;
2248
2249 assert(f);
2250 assert(p > 0);
2251
2252 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2253 if (r < 0)
2254 return r;
2255
2256 if (le64toh(o->entry.realtime) == needle)
2257 return TEST_FOUND;
2258 else if (le64toh(o->entry.realtime) < needle)
2259 return TEST_LEFT;
2260 else
2261 return TEST_RIGHT;
cec736d2
LP
2262}
2263
de190aef
LP
2264int journal_file_move_to_entry_by_realtime(
2265 JournalFile *f,
2266 uint64_t realtime,
2267 direction_t direction,
2268 Object **ret,
2269 uint64_t *offset) {
c88cc6af
VC
2270 assert(f);
2271 assert(f->header);
de190aef
LP
2272
2273 return generic_array_bisect(f,
2274 le64toh(f->header->entry_array_offset),
2275 le64toh(f->header->n_entries),
2276 realtime,
2277 test_object_realtime,
2278 direction,
2279 ret, offset, NULL);
2280}
2281
2282static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2283 Object *o;
2284 int r;
2285
2286 assert(f);
2287 assert(p > 0);
2288
2289 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2290 if (r < 0)
2291 return r;
2292
2293 if (le64toh(o->entry.monotonic) == needle)
2294 return TEST_FOUND;
2295 else if (le64toh(o->entry.monotonic) < needle)
2296 return TEST_LEFT;
2297 else
2298 return TEST_RIGHT;
2299}
2300
2a560338 2301static int find_data_object_by_boot_id(
47838ab3
ZJS
2302 JournalFile *f,
2303 sd_id128_t boot_id,
2304 Object **o,
2305 uint64_t *b) {
2a560338 2306
47838ab3
ZJS
2307 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2308
2309 sd_id128_to_string(boot_id, t + 9);
2310 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2311}
2312
de190aef
LP
2313int journal_file_move_to_entry_by_monotonic(
2314 JournalFile *f,
2315 sd_id128_t boot_id,
2316 uint64_t monotonic,
2317 direction_t direction,
2318 Object **ret,
2319 uint64_t *offset) {
2320
de190aef
LP
2321 Object *o;
2322 int r;
2323
cbdca852 2324 assert(f);
de190aef 2325
47838ab3 2326 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
2327 if (r < 0)
2328 return r;
cbdca852 2329 if (r == 0)
de190aef
LP
2330 return -ENOENT;
2331
2332 return generic_array_bisect_plus_one(f,
2333 le64toh(o->data.entry_offset),
2334 le64toh(o->data.entry_array_offset),
2335 le64toh(o->data.n_entries),
2336 monotonic,
2337 test_object_monotonic,
2338 direction,
2339 ret, offset, NULL);
2340}
2341
1fc605b0 2342void journal_file_reset_location(JournalFile *f) {
6573ef05 2343 f->location_type = LOCATION_HEAD;
1fc605b0 2344 f->current_offset = 0;
6573ef05
MS
2345 f->current_seqnum = 0;
2346 f->current_realtime = 0;
2347 f->current_monotonic = 0;
2348 zero(f->current_boot_id);
2349 f->current_xor_hash = 0;
2350}
2351
950c07d4 2352void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2353 f->location_type = LOCATION_SEEK;
2354 f->current_offset = offset;
2355 f->current_seqnum = le64toh(o->entry.seqnum);
2356 f->current_realtime = le64toh(o->entry.realtime);
2357 f->current_monotonic = le64toh(o->entry.monotonic);
2358 f->current_boot_id = o->entry.boot_id;
2359 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2360}
2361
d8ae66d7
MS
2362int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2363 assert(af);
c88cc6af 2364 assert(af->header);
d8ae66d7 2365 assert(bf);
c88cc6af 2366 assert(bf->header);
d8ae66d7
MS
2367 assert(af->location_type == LOCATION_SEEK);
2368 assert(bf->location_type == LOCATION_SEEK);
2369
2370 /* If contents and timestamps match, these entries are
2371 * identical, even if the seqnum does not match */
2372 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2373 af->current_monotonic == bf->current_monotonic &&
2374 af->current_realtime == bf->current_realtime &&
2375 af->current_xor_hash == bf->current_xor_hash)
2376 return 0;
2377
2378 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2379
2380 /* If this is from the same seqnum source, compare
2381 * seqnums */
2382 if (af->current_seqnum < bf->current_seqnum)
2383 return -1;
2384 if (af->current_seqnum > bf->current_seqnum)
2385 return 1;
2386
2387 /* Wow! This is weird, different data but the same
2388 * seqnums? Something is borked, but let's make the
2389 * best of it and compare by time. */
2390 }
2391
2392 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2393
2394 /* If the boot id matches, compare monotonic time */
2395 if (af->current_monotonic < bf->current_monotonic)
2396 return -1;
2397 if (af->current_monotonic > bf->current_monotonic)
2398 return 1;
2399 }
2400
2401 /* Otherwise, compare UTC time */
2402 if (af->current_realtime < bf->current_realtime)
2403 return -1;
2404 if (af->current_realtime > bf->current_realtime)
2405 return 1;
2406
2407 /* Finally, compare by contents */
2408 if (af->current_xor_hash < bf->current_xor_hash)
2409 return -1;
2410 if (af->current_xor_hash > bf->current_xor_hash)
2411 return 1;
2412
2413 return 0;
2414}
2415
de190aef
LP
2416int journal_file_next_entry(
2417 JournalFile *f,
f534928a 2418 uint64_t p,
de190aef
LP
2419 direction_t direction,
2420 Object **ret, uint64_t *offset) {
2421
fb099c8d 2422 uint64_t i, n, ofs;
cec736d2
LP
2423 int r;
2424
2425 assert(f);
c88cc6af 2426 assert(f->header);
de190aef
LP
2427
2428 n = le64toh(f->header->n_entries);
2429 if (n <= 0)
2430 return 0;
cec736d2 2431
f534928a 2432 if (p == 0)
de190aef 2433 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2434 else {
de190aef
LP
2435 r = generic_array_bisect(f,
2436 le64toh(f->header->entry_array_offset),
2437 le64toh(f->header->n_entries),
2438 p,
2439 test_object_offset,
2440 DIRECTION_DOWN,
2441 NULL, NULL,
2442 &i);
2443 if (r <= 0)
2444 return r;
2445
2446 if (direction == DIRECTION_DOWN) {
2447 if (i >= n - 1)
2448 return 0;
2449
2450 i++;
2451 } else {
2452 if (i <= 0)
2453 return 0;
2454
2455 i--;
2456 }
cec736d2
LP
2457 }
2458
de190aef 2459 /* And jump to it */
fb099c8d
ZJS
2460 r = generic_array_get(f,
2461 le64toh(f->header->entry_array_offset),
2462 i,
2463 ret, &ofs);
2464 if (r <= 0)
2465 return r;
2466
2467 if (p > 0 &&
2468 (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
2469 log_debug("%s: entry array corrupted at entry %"PRIu64,
2470 f->path, i);
2471 return -EBADMSG;
2472 }
2473
2474 if (offset)
2475 *offset = ofs;
2476
2477 return 1;
de190aef 2478}
cec736d2 2479
de190aef
LP
2480int journal_file_next_entry_for_data(
2481 JournalFile *f,
2482 Object *o, uint64_t p,
2483 uint64_t data_offset,
2484 direction_t direction,
2485 Object **ret, uint64_t *offset) {
2486
2487 uint64_t n, i;
cec736d2 2488 int r;
de190aef 2489 Object *d;
cec736d2
LP
2490
2491 assert(f);
de190aef 2492 assert(p > 0 || !o);
cec736d2 2493
de190aef 2494 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2495 if (r < 0)
de190aef 2496 return r;
cec736d2 2497
de190aef
LP
2498 n = le64toh(d->data.n_entries);
2499 if (n <= 0)
2500 return n;
cec736d2 2501
de190aef
LP
2502 if (!o)
2503 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2504 else {
2505 if (o->object.type != OBJECT_ENTRY)
2506 return -EINVAL;
cec736d2 2507
de190aef
LP
2508 r = generic_array_bisect_plus_one(f,
2509 le64toh(d->data.entry_offset),
2510 le64toh(d->data.entry_array_offset),
2511 le64toh(d->data.n_entries),
2512 p,
2513 test_object_offset,
2514 DIRECTION_DOWN,
2515 NULL, NULL,
2516 &i);
2517
2518 if (r <= 0)
cec736d2
LP
2519 return r;
2520
de190aef
LP
2521 if (direction == DIRECTION_DOWN) {
2522 if (i >= n - 1)
2523 return 0;
cec736d2 2524
de190aef
LP
2525 i++;
2526 } else {
2527 if (i <= 0)
2528 return 0;
cec736d2 2529
de190aef
LP
2530 i--;
2531 }
cec736d2 2532
de190aef 2533 }
cec736d2 2534
de190aef
LP
2535 return generic_array_get_plus_one(f,
2536 le64toh(d->data.entry_offset),
2537 le64toh(d->data.entry_array_offset),
2538 i,
2539 ret, offset);
2540}
cec736d2 2541
cbdca852
LP
2542int journal_file_move_to_entry_by_offset_for_data(
2543 JournalFile *f,
2544 uint64_t data_offset,
2545 uint64_t p,
2546 direction_t direction,
2547 Object **ret, uint64_t *offset) {
2548
2549 int r;
2550 Object *d;
2551
2552 assert(f);
2553
2554 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2555 if (r < 0)
2556 return r;
2557
2558 return generic_array_bisect_plus_one(f,
2559 le64toh(d->data.entry_offset),
2560 le64toh(d->data.entry_array_offset),
2561 le64toh(d->data.n_entries),
2562 p,
2563 test_object_offset,
2564 direction,
2565 ret, offset, NULL);
2566}
2567
2568int journal_file_move_to_entry_by_monotonic_for_data(
2569 JournalFile *f,
2570 uint64_t data_offset,
2571 sd_id128_t boot_id,
2572 uint64_t monotonic,
2573 direction_t direction,
2574 Object **ret, uint64_t *offset) {
2575
cbdca852
LP
2576 Object *o, *d;
2577 int r;
2578 uint64_t b, z;
2579
2580 assert(f);
2581
2582 /* First, seek by time */
47838ab3 2583 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2584 if (r < 0)
2585 return r;
2586 if (r == 0)
2587 return -ENOENT;
2588
2589 r = generic_array_bisect_plus_one(f,
2590 le64toh(o->data.entry_offset),
2591 le64toh(o->data.entry_array_offset),
2592 le64toh(o->data.n_entries),
2593 monotonic,
2594 test_object_monotonic,
2595 direction,
2596 NULL, &z, NULL);
2597 if (r <= 0)
2598 return r;
2599
2600 /* And now, continue seeking until we find an entry that
2601 * exists in both bisection arrays */
2602
2603 for (;;) {
2604 Object *qo;
2605 uint64_t p, q;
2606
2607 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2608 if (r < 0)
2609 return r;
2610
2611 r = generic_array_bisect_plus_one(f,
2612 le64toh(d->data.entry_offset),
2613 le64toh(d->data.entry_array_offset),
2614 le64toh(d->data.n_entries),
2615 z,
2616 test_object_offset,
2617 direction,
2618 NULL, &p, NULL);
2619 if (r <= 0)
2620 return r;
2621
2622 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2623 if (r < 0)
2624 return r;
2625
2626 r = generic_array_bisect_plus_one(f,
2627 le64toh(o->data.entry_offset),
2628 le64toh(o->data.entry_array_offset),
2629 le64toh(o->data.n_entries),
2630 p,
2631 test_object_offset,
2632 direction,
2633 &qo, &q, NULL);
2634
2635 if (r <= 0)
2636 return r;
2637
2638 if (p == q) {
2639 if (ret)
2640 *ret = qo;
2641 if (offset)
2642 *offset = q;
2643
2644 return 1;
2645 }
2646
2647 z = q;
2648 }
cbdca852
LP
2649}
2650
de190aef
LP
2651int journal_file_move_to_entry_by_seqnum_for_data(
2652 JournalFile *f,
2653 uint64_t data_offset,
2654 uint64_t seqnum,
2655 direction_t direction,
2656 Object **ret, uint64_t *offset) {
cec736d2 2657
de190aef
LP
2658 Object *d;
2659 int r;
cec736d2 2660
91a31dde
LP
2661 assert(f);
2662
de190aef 2663 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2664 if (r < 0)
de190aef 2665 return r;
cec736d2 2666
de190aef
LP
2667 return generic_array_bisect_plus_one(f,
2668 le64toh(d->data.entry_offset),
2669 le64toh(d->data.entry_array_offset),
2670 le64toh(d->data.n_entries),
2671 seqnum,
2672 test_object_seqnum,
2673 direction,
2674 ret, offset, NULL);
2675}
cec736d2 2676
de190aef
LP
2677int journal_file_move_to_entry_by_realtime_for_data(
2678 JournalFile *f,
2679 uint64_t data_offset,
2680 uint64_t realtime,
2681 direction_t direction,
2682 Object **ret, uint64_t *offset) {
2683
2684 Object *d;
2685 int r;
2686
91a31dde
LP
2687 assert(f);
2688
de190aef 2689 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2690 if (r < 0)
de190aef
LP
2691 return r;
2692
2693 return generic_array_bisect_plus_one(f,
2694 le64toh(d->data.entry_offset),
2695 le64toh(d->data.entry_array_offset),
2696 le64toh(d->data.n_entries),
2697 realtime,
2698 test_object_realtime,
2699 direction,
2700 ret, offset, NULL);
cec736d2
LP
2701}
2702
0284adc6 2703void journal_file_dump(JournalFile *f) {
7560fffc 2704 Object *o;
7560fffc 2705 int r;
0284adc6 2706 uint64_t p;
7560fffc
LP
2707
2708 assert(f);
c88cc6af 2709 assert(f->header);
7560fffc 2710
0284adc6 2711 journal_file_print_header(f);
7560fffc 2712
0284adc6
LP
2713 p = le64toh(f->header->header_size);
2714 while (p != 0) {
d05089d8 2715 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
2716 if (r < 0)
2717 goto fail;
7560fffc 2718
0284adc6 2719 switch (o->object.type) {
d98cc1f2 2720
0284adc6
LP
2721 case OBJECT_UNUSED:
2722 printf("Type: OBJECT_UNUSED\n");
2723 break;
d98cc1f2 2724
0284adc6
LP
2725 case OBJECT_DATA:
2726 printf("Type: OBJECT_DATA\n");
2727 break;
7560fffc 2728
3c1668da
LP
2729 case OBJECT_FIELD:
2730 printf("Type: OBJECT_FIELD\n");
2731 break;
2732
0284adc6 2733 case OBJECT_ENTRY:
507f22bd
ZJS
2734 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
2735 le64toh(o->entry.seqnum),
2736 le64toh(o->entry.monotonic),
2737 le64toh(o->entry.realtime));
0284adc6 2738 break;
7560fffc 2739
0284adc6
LP
2740 case OBJECT_FIELD_HASH_TABLE:
2741 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
2742 break;
7560fffc 2743
0284adc6
LP
2744 case OBJECT_DATA_HASH_TABLE:
2745 printf("Type: OBJECT_DATA_HASH_TABLE\n");
2746 break;
7560fffc 2747
0284adc6
LP
2748 case OBJECT_ENTRY_ARRAY:
2749 printf("Type: OBJECT_ENTRY_ARRAY\n");
2750 break;
7560fffc 2751
0284adc6 2752 case OBJECT_TAG:
507f22bd
ZJS
2753 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
2754 le64toh(o->tag.seqnum),
2755 le64toh(o->tag.epoch));
0284adc6 2756 break;
3c1668da
LP
2757
2758 default:
8facc349 2759 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 2760 break;
0284adc6 2761 }
7560fffc 2762
d89c8fdf
ZJS
2763 if (o->object.flags & OBJECT_COMPRESSION_MASK)
2764 printf("Flags: %s\n",
2765 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 2766
0284adc6
LP
2767 if (p == le64toh(f->header->tail_object_offset))
2768 p = 0;
2769 else
2770 p = p + ALIGN64(le64toh(o->object.size));
2771 }
7560fffc 2772
0284adc6
LP
2773 return;
2774fail:
2775 log_error("File corrupt");
7560fffc
LP
2776}
2777
718fe4b1
ZJS
2778static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
2779 const char *x;
2780
2781 x = format_timestamp(buf, l, t);
2782 if (x)
2783 return x;
2784 return " --- ";
2785}
2786
/* Prints a human-readable summary of the journal file header to stdout: path, the four
 * 128bit IDs, state, compatible/incompatible flags, header/arena/hash table sizes, the
 * rotation suggestion, head/tail sequence numbers and timestamps, and object counts.
 * Header counters that not every header contains are printed only when present, and the
 * disk usage line is printed only if fstat() on the file descriptor succeeds. */
0284adc6 2787void journal_file_print_header(JournalFile *f) {
/* 33 bytes each: room for a 32 character ID string plus the trailing NUL */
2765b7bb 2788 char a[33], b[33], c[33], d[33];
ed375beb 2789 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
2790 struct stat st;
2791 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
2792
2793 assert(f);
c88cc6af 2794 assert(f->header);
7560fffc 2795
0284adc6
LP
2796 printf("File Path: %s\n"
2797 "File ID: %s\n"
2798 "Machine ID: %s\n"
2799 "Boot ID: %s\n"
2800 "Sequential Number ID: %s\n"
2801 "State: %s\n"
2802 "Compatible Flags:%s%s\n"
d89c8fdf 2803 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
2804 "Header size: %"PRIu64"\n"
2805 "Arena size: %"PRIu64"\n"
2806 "Data Hash Table Size: %"PRIu64"\n"
2807 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 2808 "Rotate Suggested: %s\n"
507f22bd
ZJS
2809 "Head Sequential Number: %"PRIu64"\n"
2810 "Tail Sequential Number: %"PRIu64"\n"
0284adc6 2811 "Head Realtime Timestamp: %s\n"
3223f44f 2812 "Tail Realtime Timestamp: %s\n"
ed375beb 2813 "Tail Monotonic Timestamp: %s\n"
507f22bd
ZJS
2814 "Objects: %"PRIu64"\n"
2815 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
2816 f->path,
2817 sd_id128_to_string(f->header->file_id, a),
2818 sd_id128_to_string(f->header->machine_id, b),
2819 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 2820 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
2821 f->header->state == STATE_OFFLINE ? "OFFLINE" :
2822 f->header->state == STATE_ONLINE ? "ONLINE" :
2823 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 2824 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
2825 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
2826 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
2827 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
2828 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
2829 le64toh(f->header->header_size),
2830 le64toh(f->header->arena_size),
2831 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
2832 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 2833 yes_no(journal_file_rotate_suggested(f, 0)),
507f22bd
ZJS
2834 le64toh(f->header->head_entry_seqnum),
2835 le64toh(f->header->tail_entry_seqnum),
718fe4b1
ZJS
2836 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
2837 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
ed375beb 2838 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
507f22bd
ZJS
2839 le64toh(f->header->n_objects),
2840 le64toh(f->header->n_entries));
7560fffc 2841
/* The remaining counters are printed only if JOURNAL_HEADER_CONTAINS() reports that this
 * file's header includes the respective field. The fill level is the object count relative
 * to the number of hash table buckets (table size divided by sizeof(HashItem)). */
0284adc6 2842 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 2843 printf("Data Objects: %"PRIu64"\n"
0284adc6 2844 "Data Hash Table Fill: %.1f%%\n",
507f22bd 2845 le64toh(f->header->n_data),
0284adc6 2846 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 2847
0284adc6 2848 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 2849 printf("Field Objects: %"PRIu64"\n"
0284adc6 2850 "Field Hash Table Fill: %.1f%%\n",
507f22bd 2851 le64toh(f->header->n_fields),
0284adc6 2852 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
2853
2854 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
2855 printf("Tag Objects: %"PRIu64"\n",
2856 le64toh(f->header->n_tags));
3223f44f 2857 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
2858 printf("Entry Array Objects: %"PRIu64"\n",
2859 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
2860
/* Disk usage is computed as st_blocks * 512 rather than taken from st_size; a failing
 * fstat() is silently ignored. */
2861 if (fstat(f->fd, &st) >= 0)
59f448cf 2862 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
7560fffc
LP
2863}
2864
fc68c929
LP
2865static int journal_file_warn_btrfs(JournalFile *f) {
2866 unsigned attrs;
2867 int r;
2868
2869 assert(f);
2870
2871 /* Before we write anything, check if the COW logic is turned
2872 * off on btrfs. Given our write pattern that is quite
2873 * unfriendly to COW file systems this should greatly improve
2874 * performance on COW file systems, such as btrfs, at the
2875 * expense of data integrity features (which shouldn't be too
2876 * bad, given that we do our own checksumming). */
2877
2878 r = btrfs_is_filesystem(f->fd);
2879 if (r < 0)
2880 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
2881 if (!r)
2882 return 0;
2883
2884 r = read_attr_fd(f->fd, &attrs);
2885 if (r < 0)
2886 return log_warning_errno(r, "Failed to read file attributes: %m");
2887
2888 if (attrs & FS_NOCOW_FL) {
2889 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
2890 return 0;
2891 }
2892
2893 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
2894 "This is likely to slow down journal access substantially, please consider turning "
2895 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
2896
2897 return 1;
2898}
2899
0284adc6
LP
/* Opens the journal file fname and returns a newly allocated JournalFile in *ret. A file
 * that is empty and opened writable is initialized as a new journal: header, field and data
 * hash tables and, when built with gcrypt, the first tag.
 *
 * fname must end in ".journal" or ".journal~", and the access mode in flags must be
 * O_RDONLY or O_RDWR; otherwise -EINVAL is returned.
 * compress is stored as the LZ4 or XZ compression setting, depending on which of the two is
 * compiled in; seal is stored only when built with gcrypt.
 * metrics, if non-NULL and the file is writable, is completed via journal_default_metrics()
 * and copied into the file; otherwise the metrics of template (if given) are copied.
 * mmap_cache, if non-NULL, is referenced; otherwise a new cache is allocated.
 * deferred_closes, if non-NULL, is passed to journal_file_close_set() before the header of
 * an already existing file is verified.
 * template, if non-NULL, is handed to journal_file_init_header() for new files and its
 * post-change timer (if any) is set up on the new file too.
 *
 * Returns 0 on success. On failure the partially set up file is closed again and a
 * negative errno-style value is returned: -ENOMEM on allocation failure, -ENODATA if the
 * file is smaller than HEADER_SIZE_MIN, -EIO if the mmap cache recorded a SIGBUS for this
 * fd, or whatever the failing helper returned. */
2900int journal_file_open(
2901 const char *fname,
2902 int flags,
2903 mode_t mode,
2904 bool compress,
baed47c3 2905 bool seal,
0284adc6
LP
2906 JournalMetrics *metrics,
2907 MMapCache *mmap_cache,
b58c888f 2908 Set *deferred_closes,
0284adc6
LP
2909 JournalFile *template,
2910 JournalFile **ret) {
7560fffc 2911
fa6ac760 2912 bool newly_created = false;
0284adc6 2913 JournalFile *f;
fa6ac760 2914 void *h;
0284adc6 2915 int r;
7560fffc 2916
0284adc6 2917 assert(fname);
0559d3a5 2918 assert(ret);
7560fffc 2919
0284adc6
LP
2920 if ((flags & O_ACCMODE) != O_RDONLY &&
2921 (flags & O_ACCMODE) != O_RDWR)
2922 return -EINVAL;
7560fffc 2923
a0108012
LP
2924 if (!endswith(fname, ".journal") &&
2925 !endswith(fname, ".journal~"))
0284adc6 2926 return -EINVAL;
7560fffc 2927
0284adc6
LP
2928 f = new0(JournalFile, 1);
2929 if (!f)
2930 return -ENOMEM;
7560fffc 2931
/* Mark the fd as "not open" right away: the fail path below checks f->fd >= 0. */
0284adc6
LP
2932 f->fd = -1;
2933 f->mode = mode;
7560fffc 2934
0284adc6
LP
2935 f->flags = flags;
2936 f->prot = prot_from_flags(flags);
2937 f->writable = (flags & O_ACCMODE) != O_RDONLY;
/* If both compressors are compiled in, LZ4 is the one that gets enabled. */
92261977 2938#if defined(HAVE_LZ4)
d89c8fdf
ZJS
2939 f->compress_lz4 = compress;
2940#elif defined(HAVE_XZ)
2941 f->compress_xz = compress;
48b61739 2942#endif
49a32d43 2943#ifdef HAVE_GCRYPT
baed47c3 2944 f->seal = seal;
49a32d43 2945#endif
7560fffc 2946
0284adc6
LP
2947 if (mmap_cache)
2948 f->mmap = mmap_cache_ref(mmap_cache);
2949 else {
84168d80 2950 f->mmap = mmap_cache_new();
0284adc6
LP
2951 if (!f->mmap) {
2952 r = -ENOMEM;
2953 goto fail;
2954 }
2955 }
7560fffc 2956
0284adc6
LP
2957 f->path = strdup(fname);
2958 if (!f->path) {
2959 r = -ENOMEM;
2960 goto fail;
2961 }
7560fffc 2962
4743015d 2963 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
2964 if (!f->chain_cache) {
2965 r = -ENOMEM;
2966 goto fail;
2967 }
2968
0284adc6
LP
2969 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
2970 if (f->fd < 0) {
2971 r = -errno;
2972 goto fail;
7560fffc 2973 }
7560fffc 2974
2678031a
LP
2975 r = journal_file_fstat(f);
2976 if (r < 0)
0284adc6 2977 goto fail;
7560fffc 2978
/* An empty file opened for writing is treated as a journal that still has to be created. */
0284adc6 2979 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a 2980
fc68c929 2981 (void) journal_file_warn_btrfs(f);
11689d2a 2982
fb0951b0
LP
2983 /* Let's attach the creation time to the journal file,
2984 * so that the vacuuming code knows the age of this
2985 * file even if the file might end up corrupted one
2986 * day... Ideally we'd just use the creation time many
2987 * file systems maintain for each file, but there is
2988 * currently no usable API to query this, hence let's
2989 * emulate this via extended attributes. If extended
2990 * attributes are not supported we'll just skip this,
7517e174 2991 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0 2992
d61b600d 2993 fd_setcrtime(f->fd, 0);
7560fffc 2994
feb12d3e 2995#ifdef HAVE_GCRYPT
0284adc6 2996 /* Try to load the FSPRG state, and if we can't, then
baed47c3 2997 * just don't do sealing */
49a32d43
LP
2998 if (f->seal) {
2999 r = journal_file_fss_load(f);
3000 if (r < 0)
3001 f->seal = false;
3002 }
feb12d3e 3003#endif
7560fffc 3004
0284adc6
LP
3005 r = journal_file_init_header(f, template);
3006 if (r < 0)
3007 goto fail;
7560fffc 3008
/* Refresh the cached stat data: the size check below needs the size after the header was written. */
2678031a
LP
3009 r = journal_file_fstat(f);
3010 if (r < 0)
0284adc6 3011 goto fail;
fb0951b0
LP
3012
3013 newly_created = true;
0284adc6 3014 }
7560fffc 3015
0284adc6 3016 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
cfb571f3 3017 r = -ENODATA;
0284adc6
LP
3018 goto fail;
3019 }
7560fffc 3020
fa6ac760 3021 r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
977eaa1e 3022 if (r < 0)
0284adc6 3023 goto fail;
7560fffc 3024
fa6ac760
LP
3025 f->header = h;
3026
/* Only pre-existing files need verification; a header we just wrote ourselves is trusted. */
0284adc6 3027 if (!newly_created) {
b58c888f
VC
3028 if (deferred_closes)
3029 journal_file_close_set(deferred_closes);
3030
0284adc6
LP
3031 r = journal_file_verify_header(f);
3032 if (r < 0)
3033 goto fail;
3034 }
7560fffc 3035
feb12d3e 3036#ifdef HAVE_GCRYPT
/* Unlike in the creation path above, a failure to load the FSPRG state is fatal here. */
0284adc6 3037 if (!newly_created && f->writable) {
baed47c3 3038 r = journal_file_fss_load(f);
0284adc6
LP
3039 if (r < 0)
3040 goto fail;
3041 }
feb12d3e 3042#endif
cec736d2
LP
3043
3044 if (f->writable) {
4a92baf3
LP
3045 if (metrics) {
3046 journal_default_metrics(metrics, f->fd);
3047 f->metrics = *metrics;
3048 } else if (template)
3049 f->metrics = template->metrics;
3050
cec736d2
LP
3051 r = journal_file_refresh_header(f);
3052 if (r < 0)
3053 goto fail;
3054 }
3055
feb12d3e 3056#ifdef HAVE_GCRYPT
baed47c3 3057 r = journal_file_hmac_setup(f);
14d10188
LP
3058 if (r < 0)
3059 goto fail;
feb12d3e 3060#endif
14d10188 3061
cec736d2 3062 if (newly_created) {
de190aef 3063 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
3064 if (r < 0)
3065 goto fail;
3066
de190aef 3067 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
3068 if (r < 0)
3069 goto fail;
7560fffc 3070
feb12d3e 3071#ifdef HAVE_GCRYPT
7560fffc
LP
3072 r = journal_file_append_first_tag(f);
3073 if (r < 0)
3074 goto fail;
feb12d3e 3075#endif
cec736d2
LP
3076 }
3077
/* A SIGBUS recorded by the mmap cache for this fd is reported as -EIO. */
fa6ac760
LP
3078 if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
3079 r = -EIO;
3080 goto fail;
3081 }
3082
/* Set up the same post-change timer (same event loop and period) as the template has. */
7a24f3bf 3083 if (template && template->post_change_timer) {
e167d7fd
LP
3084 r = journal_file_enable_post_change_timer(
3085 f,
3086 sd_event_source_get_event(template->post_change_timer),
3087 template->post_change_timer_period);
7a24f3bf 3088
7a24f3bf
VC
3089 if (r < 0)
3090 goto fail;
3091 }
3092
0559d3a5 3093 *ret = f;
cec736d2
LP
3094 return 0;
3095
3096fail:
/* A recorded SIGBUS takes precedence over whatever error brought us here. */
fa6ac760
LP
3097 if (f->fd >= 0 && mmap_cache_got_sigbus(f->mmap, f->fd))
3098 r = -EIO;
3099
69a3a6fd 3100 (void) journal_file_close(f);
cec736d2
LP
3101
3102 return r;
3103}
0ac38b70 3104
b58c888f 3105int journal_file_rotate(JournalFile **f, bool compress, bool seal, Set *deferred_closes) {
57535f47 3106 _cleanup_free_ char *p = NULL;
0ac38b70
LP
3107 size_t l;
3108 JournalFile *old_file, *new_file = NULL;
3109 int r;
3110
3111 assert(f);
3112 assert(*f);
3113
3114 old_file = *f;
3115
3116 if (!old_file->writable)
3117 return -EINVAL;
3118
3119 if (!endswith(old_file->path, ".journal"))
3120 return -EINVAL;
3121
3122 l = strlen(old_file->path);
57535f47
ZJS
3123 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3124 (int) l - 8, old_file->path,
3125 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
3126 le64toh((*f)->header->head_entry_seqnum),
3127 le64toh((*f)->header->head_entry_realtime));
3128 if (r < 0)
0ac38b70
LP
3129 return -ENOMEM;
3130
2678031a
LP
3131 /* Try to rename the file to the archived version. If the file
3132 * already was deleted, we'll get ENOENT, let's ignore that
3133 * case. */
0ac38b70 3134 r = rename(old_file->path, p);
2678031a 3135 if (r < 0 && errno != ENOENT)
0ac38b70
LP
3136 return -errno;
3137
ccdbaf91 3138 old_file->header->state = STATE_ARCHIVED;
0ac38b70 3139
f27a3864
LP
3140 /* Currently, btrfs is not very good with out write patterns
3141 * and fragments heavily. Let's defrag our journal files when
3142 * we archive them */
3143 old_file->defrag_on_close = true;
3144
b58c888f
VC
3145 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, deferred_closes, old_file, &new_file);
3146
3147 if (deferred_closes &&
3148 set_put(deferred_closes, old_file) >= 0)
3149 (void) journal_file_set_offline(old_file, false);
3150 else
3151 (void) journal_file_close(old_file);
0ac38b70
LP
3152
3153 *f = new_file;
3154 return r;
3155}
3156
9447a7f1
LP
3157int journal_file_open_reliably(
3158 const char *fname,
3159 int flags,
3160 mode_t mode,
7560fffc 3161 bool compress,
baed47c3 3162 bool seal,
4a92baf3 3163 JournalMetrics *metrics,
27370278 3164 MMapCache *mmap_cache,
b58c888f 3165 Set *deferred_closes,
9447a7f1
LP
3166 JournalFile *template,
3167 JournalFile **ret) {
3168
3169 int r;
3170 size_t l;
ed375beb 3171 _cleanup_free_ char *p = NULL;
9447a7f1 3172
b58c888f 3173 r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
288359db
ZJS
3174 if (!IN_SET(r,
3175 -EBADMSG, /* corrupted */
3176 -ENODATA, /* truncated */
3177 -EHOSTDOWN, /* other machine */
3178 -EPROTONOSUPPORT, /* incompatible feature */
3179 -EBUSY, /* unclean shutdown */
3180 -ESHUTDOWN, /* already archived */
3181 -EIO, /* IO error, including SIGBUS on mmap */
3182 -EIDRM /* File has been deleted */))
9447a7f1
LP
3183 return r;
3184
3185 if ((flags & O_ACCMODE) == O_RDONLY)
3186 return r;
3187
3188 if (!(flags & O_CREAT))
3189 return r;
3190
7560fffc
LP
3191 if (!endswith(fname, ".journal"))
3192 return r;
3193
5c70eab4
LP
3194 /* The file is corrupted. Rotate it away and try it again (but only once) */
3195
9447a7f1 3196 l = strlen(fname);
d587eca5 3197 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
57535f47 3198 (int) l - 8, fname,
d587eca5 3199 now(CLOCK_REALTIME),
9bf3b535 3200 random_u64()) < 0)
9447a7f1
LP
3201 return -ENOMEM;
3202
65089b82 3203 if (rename(fname, p) < 0)
9447a7f1
LP
3204 return -errno;
3205
f27a3864
LP
3206 /* btrfs doesn't cope well with our write pattern and
3207 * fragments heavily. Let's defrag all files we rotate */
11689d2a
LP
3208
3209 (void) chattr_path(p, false, FS_NOCOW_FL);
f27a3864
LP
3210 (void) btrfs_defrag(p);
3211
65089b82 3212 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 3213
b58c888f 3214 return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
9447a7f1
LP
3215}
3216
cf244689
LP
3217int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
3218 uint64_t i, n;
3219 uint64_t q, xor_hash = 0;
3220 int r;
3221 EntryItem *items;
3222 dual_timestamp ts;
3223
3224 assert(from);
3225 assert(to);
3226 assert(o);
3227 assert(p);
3228
3229 if (!to->writable)
3230 return -EPERM;
3231
3232 ts.monotonic = le64toh(o->entry.monotonic);
3233 ts.realtime = le64toh(o->entry.realtime);
3234
cf244689 3235 n = journal_file_entry_n_items(o);
4faa7004
TA
3236 /* alloca() can't take 0, hence let's allocate at least one */
3237 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
3238
3239 for (i = 0; i < n; i++) {
4fd052ae
FC
3240 uint64_t l, h;
3241 le64_t le_hash;
cf244689
LP
3242 size_t t;
3243 void *data;
3244 Object *u;
3245
3246 q = le64toh(o->entry.items[i].object_offset);
3247 le_hash = o->entry.items[i].hash;
3248
3249 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3250 if (r < 0)
3251 return r;
3252
3253 if (le_hash != o->data.hash)
3254 return -EBADMSG;
3255
3256 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3257 t = (size_t) l;
3258
3259 /* We hit the limit on 32bit machines */
3260 if ((uint64_t) t != l)
3261 return -E2BIG;
3262
d89c8fdf 3263 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
3b1a55e1 3264#if defined(HAVE_XZ) || defined(HAVE_LZ4)
a7f7d1bd 3265 size_t rsize = 0;
cf244689 3266
d89c8fdf
ZJS
3267 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3268 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3269 if (r < 0)
3270 return r;
cf244689
LP
3271
3272 data = from->compress_buffer;
3273 l = rsize;
3b1a55e1
ZJS
3274#else
3275 return -EPROTONOSUPPORT;
3276#endif
cf244689
LP
3277 } else
3278 data = o->data.payload;
3279
3280 r = journal_file_append_data(to, data, l, &u, &h);
3281 if (r < 0)
3282 return r;
3283
3284 xor_hash ^= le64toh(u->data.hash);
3285 items[i].object_offset = htole64(h);
3286 items[i].hash = u->data.hash;
3287
3288 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3289 if (r < 0)
3290 return r;
3291 }
3292
fa6ac760
LP
3293 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
3294
3295 if (mmap_cache_got_sigbus(to->mmap, to->fd))
3296 return -EIO;
3297
3298 return r;
cf244689 3299}
babfc091 3300
8580d1f7
LP
3301void journal_reset_metrics(JournalMetrics *m) {
3302 assert(m);
3303
3304 /* Set everything to "pick automatic values". */
3305
3306 *m = (JournalMetrics) {
3307 .min_use = (uint64_t) -1,
3308 .max_use = (uint64_t) -1,
3309 .min_size = (uint64_t) -1,
3310 .max_size = (uint64_t) -1,
3311 .keep_free = (uint64_t) -1,
3312 .n_max_files = (uint64_t) -1,
3313 };
3314}
3315
babfc091 3316void journal_default_metrics(JournalMetrics *m, int fd) {
8580d1f7 3317 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
babfc091 3318 struct statvfs ss;
8580d1f7 3319 uint64_t fs_size;
babfc091
LP
3320
3321 assert(m);
3322 assert(fd >= 0);
3323
3324 if (fstatvfs(fd, &ss) >= 0)
3325 fs_size = ss.f_frsize * ss.f_blocks;
8580d1f7
LP
3326 else {
3327 log_debug_errno(errno, "Failed to detremine disk size: %m");
3328 fs_size = 0;
3329 }
babfc091
LP
3330
3331 if (m->max_use == (uint64_t) -1) {
3332
3333 if (fs_size > 0) {
3334 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3335
3336 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3337 m->max_use = DEFAULT_MAX_USE_UPPER;
3338
3339 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3340 m->max_use = DEFAULT_MAX_USE_LOWER;
3341 } else
3342 m->max_use = DEFAULT_MAX_USE_LOWER;
3343 } else {
3344 m->max_use = PAGE_ALIGN(m->max_use);
3345
8580d1f7 3346 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
babfc091
LP
3347 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3348 }
3349
8580d1f7
LP
3350 if (m->min_use == (uint64_t) -1)
3351 m->min_use = DEFAULT_MIN_USE;
3352
3353 if (m->min_use > m->max_use)
3354 m->min_use = m->max_use;
3355
babfc091
LP
3356 if (m->max_size == (uint64_t) -1) {
3357 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3358
3359 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3360 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3361 } else
3362 m->max_size = PAGE_ALIGN(m->max_size);
3363
8580d1f7
LP
3364 if (m->max_size != 0) {
3365 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3366 m->max_size = JOURNAL_FILE_SIZE_MIN;
babfc091 3367
8580d1f7
LP
3368 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3369 m->max_use = m->max_size*2;
3370 }
babfc091
LP
3371
3372 if (m->min_size == (uint64_t) -1)
3373 m->min_size = JOURNAL_FILE_SIZE_MIN;
3374 else {
3375 m->min_size = PAGE_ALIGN(m->min_size);
3376
3377 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3378 m->min_size = JOURNAL_FILE_SIZE_MIN;
3379
8580d1f7 3380 if (m->max_size != 0 && m->min_size > m->max_size)
babfc091
LP
3381 m->max_size = m->min_size;
3382 }
3383
3384 if (m->keep_free == (uint64_t) -1) {
3385
3386 if (fs_size > 0) {
8621b110 3387 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
3388
3389 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3390 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3391
3392 } else
3393 m->keep_free = DEFAULT_KEEP_FREE;
3394 }
3395
8580d1f7
LP
3396 if (m->n_max_files == (uint64_t) -1)
3397 m->n_max_files = DEFAULT_N_MAX_FILES;
3398
3399 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3400 format_bytes(a, sizeof(a), m->min_use),
3401 format_bytes(b, sizeof(b), m->max_use),
3402 format_bytes(c, sizeof(c), m->max_size),
3403 format_bytes(d, sizeof(d), m->min_size),
3404 format_bytes(e, sizeof(e), m->keep_free),
3405 m->n_max_files);
babfc091 3406}
08984293
LP
3407
3408int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293 3409 assert(f);
c88cc6af 3410 assert(f->header);
08984293
LP
3411 assert(from || to);
3412
3413 if (from) {
162566a4
LP
3414 if (f->header->head_entry_realtime == 0)
3415 return -ENOENT;
08984293 3416
162566a4 3417 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
3418 }
3419
3420 if (to) {
162566a4
LP
3421 if (f->header->tail_entry_realtime == 0)
3422 return -ENOENT;
08984293 3423
162566a4 3424 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
3425 }
3426
3427 return 1;
3428}
3429
3430int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
3431 Object *o;
3432 uint64_t p;
3433 int r;
3434
3435 assert(f);
3436 assert(from || to);
3437
47838ab3 3438 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
3439 if (r <= 0)
3440 return r;
3441
3442 if (le64toh(o->data.n_entries) <= 0)
3443 return 0;
3444
3445 if (from) {
3446 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3447 if (r < 0)
3448 return r;
3449
3450 *from = le64toh(o->entry.monotonic);
3451 }
3452
3453 if (to) {
3454 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3455 if (r < 0)
3456 return r;
3457
3458 r = generic_array_get_plus_one(f,
3459 le64toh(o->data.entry_offset),
3460 le64toh(o->data.entry_array_offset),
3461 le64toh(o->data.n_entries)-1,
3462 &o, NULL);
3463 if (r <= 0)
3464 return r;
3465
3466 *to = le64toh(o->entry.monotonic);
3467 }
3468
3469 return 1;
3470}
dca6219e 3471
fb0951b0 3472bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e 3473 assert(f);
c88cc6af 3474 assert(f->header);
dca6219e
LP
3475
3476 /* If we gained new header fields we gained new features,
3477 * hence suggest a rotation */
361f9cbc
LP
3478 if (le64toh(f->header->header_size) < sizeof(Header)) {
3479 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3480 return true;
361f9cbc 3481 }
dca6219e
LP
3482
3483 /* Let's check if the hash tables grew over a certain fill
3484 * level (75%, borrowing this value from Java's hash table
3485 * implementation), and if so suggest a rotation. To calculate
3486 * the fill level we need the n_data field, which only exists
3487 * in newer versions. */
3488
3489 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3490 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3491 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3492 f->path,
3493 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3494 le64toh(f->header->n_data),
3495 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3496 (unsigned long long) f->last_stat.st_size,
3497 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3498 return true;
361f9cbc 3499 }
dca6219e
LP
3500
3501 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3502 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3503 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3504 f->path,
3505 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3506 le64toh(f->header->n_fields),
3507 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3508 return true;
361f9cbc 3509 }
dca6219e 3510
0598fd4a
LP
3511 /* Are the data objects properly indexed by field objects? */
3512 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3513 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3514 le64toh(f->header->n_data) > 0 &&
3515 le64toh(f->header->n_fields) == 0)
3516 return true;
3517
fb0951b0
LP
3518 if (max_file_usec > 0) {
3519 usec_t t, h;
3520
3521 h = le64toh(f->header->head_entry_realtime);
3522 t = now(CLOCK_REALTIME);
3523
3524 if (h > 0 && t > h + max_file_usec)
3525 return true;
3526 }
3527
dca6219e
LP
3528 return false;
3529}